// =========================================================================== // mm_distr_object_sched_id() Finds out which scheduler is responsible for // a given object. This is approximate: it // indicates the direction to be searched // (one level up or one level down). A query // must be made at that level to learn more. // =========================================================================== // * INPUTS // void *ptr The object we're searching for // // * RETURN VALUE // int scheduler_id on success // ERR_OUT_OF_RANGE: pointer does not exist // anywhere in the system // =========================================================================== int mm_distr_object_sched_id(void *ptr) { Context *context; MmRgnAdrRange *range; // Sanity checks context = mm_get_context(ar_get_core_id()); ar_assert(ptr); ar_assert(context->mm_used_ranges); // See if pointer belongs to our scheduler subtree. We have to check that, // because direction of search initially begins upwards (from child // scheduler to parent scheduler) but later on, if it passes the top-level, // it becomes downwards (from parent scheduler to child scheduler) until it // reaches the correct scheduler. kt_trie_find_approx(context->mm_used_ranges, 0, (size_t) ptr, (void *) &range); if (range) { // We found something, but does it contain the pointer? ar_assert(range->address <= (size_t) ptr); if ((size_t) ptr >= range->address + range->num_slabs * MM_SLAB_SIZE) { range = NULL; } } // We know nothing about it if (!range) { // Are we the top-level scheduler? if (context->pr_parent_sched_id == -1) { // No such pointer exists anywhere in the system return ERR_OUT_OF_RANGE; } // Otherwise, our parent scheduler should know else { return context->pr_parent_sched_id; } } // Region belongs to our subtree else { // Return appropriate child scheduler return range->sched_id; } }
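// ===========================================================================
// Illustrative sketch, not part of the original code: the containment test
// performed above, factored out for exposition. An entry of the used-ranges
// trie covers the address interval
// [address, address + num_slabs * MM_SLAB_SIZE). The helper name is
// hypothetical.
// ===========================================================================
static int mm_distr_range_contains(MmRgnAdrRange *range, void *ptr) {
  // Hypothetical helper, for exposition only
  return (range != NULL) &&
         (range->address <= (size_t) ptr) &&
         ((size_t) ptr < range->address + range->num_slabs * MM_SLAB_SIZE);
}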
// ===========================================================================
// mm_distr_region_sched_id()   Finds out which scheduler is responsible
//                              for a given region. This is approximate: it
//                              indicates the direction to be searched
//                              (one level up or one level down). A query
//                              must be made at that level to learn more.
// ===========================================================================
// * INPUTS
//   rid_t region               The region ID we're searching for
//
// * RETURN VALUE
//   int                        scheduler_id on success
//                              ERR_NO_SUCH_REGION: region does not exist
//                              anywhere in the system
// ===========================================================================
int mm_distr_region_sched_id(rid_t region) {

  Context         *context;
  MmRgnRidRange   *rid_range;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(region);
  ar_assert(context->mm_used_rids);

  // See if region belongs to our scheduler subtree. We have to check that,
  // because direction of search initially begins upwards (from child
  // scheduler to parent scheduler) but later on, if it passes the top-level,
  // it becomes downwards (from parent scheduler to child scheduler) until it
  // reaches the correct scheduler.
  kt_trie_find_approx(context->mm_used_rids, 0, region, (void *) &rid_range);

  if (rid_range) {
    // We found something, but does it contain this region ID?
    ar_assert(region >= rid_range->rid);
    if (region >= rid_range->rid + rid_range->num_rids) {
      rid_range = NULL;
    }
  }

  // We know nothing about it
  if (!rid_range) {
    // Are we the top-level scheduler?
    if (context->pr_parent_sched_id == -1) {
      // No such region exists anywhere in the system
      return ERR_NO_SUCH_REGION;
    }
    // Otherwise, our parent scheduler should know more
    else {
      return context->pr_parent_sched_id;
    }
  }
  // Region belongs to our subtree
  else {
    // Return appropriate child scheduler
    return rid_range->sched_id;
  }
}
// =========================================================================== // mm_distr_balloc() Requests a new bulk-objects allocation in a // remote region from the appropriate scheduler // =========================================================================== // * INPUTS // size_t size Size of the new object // rid_t region Remote region ID of the new object // int num_elements Number of objects to bulk-allocate // // * RETURN VALUE // int Message ID of the new request on success // ERR_NO_SUCH_REGION: failure, we are sure // that this region ID does not exist in the // whole system // =========================================================================== int mm_distr_balloc(size_t size, rid_t region, int num_elements) { Context *context; int sched_id; PrMsgReq *new_req; int id; // Sanity checks context = mm_get_context(ar_get_core_id()); ar_assert(size); ar_assert(region); ar_assert(num_elements); // Find which scheduler should we ask for this region sched_id = mm_distr_region_sched_id(region); if (sched_id == ERR_NO_SUCH_REGION) { // No such region exists anywhere in the system return ERR_NO_SUCH_REGION; } // It can't be ours, local functions should have picked it up ar_assert(sched_id != context->pr_scheduler_id); // Build message new_req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id)); new_req->core_id = context->pr_core_id; new_req->req_id = context->pr_message_id; new_req->type = REQ_BALLOC; new_req->size = size; new_req->region = region; new_req->ptr = (void *) ((size_t) num_elements); // Send message to the selected scheduler ar_assert(!noc_msg_send()); // Increase message ID id = context->pr_message_id; context->pr_message_id = pr_advance_msg_id(context->pr_message_id); // Success return id; }
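// ===========================================================================
// Illustrative sketch, not part of the original code: REQ_BALLOC has no
// dedicated field for the element count, so mm_distr_balloc() above smuggles
// it through the ptr field. A receiver is assumed to recover it with the
// reverse cast, as sketched below (the helper name is hypothetical).
// ===========================================================================
static int mm_distr_balloc_unpack_elements(PrMsgReq *req) {
  // Reverse of: new_req->ptr = (void *) ((size_t) num_elements);
  return (int) ((size_t) req->ptr);
}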
// =========================================================================== // mm_distr_rfree_update_parent() Requests the updating of a remote parent // as to the loss of one of its children // =========================================================================== // * INPUTS // rid_t parent Remote region ID of parent // rid_t child Region ID of the child that is deleted // // * RETURN VALUE // int Message ID of the new request on success // =========================================================================== int mm_distr_rfree_update_parent(rid_t parent, rid_t child) { Context *context; int sched_id; PrMsgReq *new_req; int id; // Sanity checks context = mm_get_context(ar_get_core_id()); ar_assert(parent); ar_assert(child); // Find which scheduler should we ask for the parent sched_id = mm_distr_region_sched_id(parent); if (sched_id == ERR_NO_SUCH_REGION) { // Corruption, we should have a valid parent ID ar_abort(); } // It can't be ours, local functions should have picked it up ar_assert(sched_id != context->pr_scheduler_id); // Build message new_req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id)); new_req->core_id = context->pr_core_id; new_req->req_id = context->pr_message_id; new_req->type = REQ_RFREE_UPDATE_PARENT; new_req->region = parent; new_req->ptr = (void *) child; // Send message to the selected scheduler ar_assert(!noc_msg_send()); // Increase message ID id = context->pr_message_id; context->pr_message_id = pr_advance_msg_id(context->pr_message_id); // Success return id; }
// ===========================================================================
// mm_distr_free()              Requests the freeing of a remote object
//                              from the appropriate scheduler
// ===========================================================================
// * INPUTS
//   void *ptr                  The remote object that must be freed
//
// * RETURN VALUE
//   int                        Message ID of the new request on success
//                              ERR_OUT_OF_RANGE: failure, we are sure
//                              that this pointer is not handled by anyone
//                              in the whole system
// ===========================================================================
int mm_distr_free(void *ptr) {

  Context       *context;
  int           sched_id;
  PrMsgReq      *new_req;
  int           id;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(ptr);

  // Find out which scheduler we should ask for this object
  sched_id = mm_distr_object_sched_id(ptr);
  if (sched_id == ERR_OUT_OF_RANGE) {
    // No such object exists anywhere in the system
    return ERR_OUT_OF_RANGE;
  }

  // It can't be ours, local functions should have picked it up
  ar_assert(sched_id != context->pr_scheduler_id);

  // Build message
  new_req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id));

  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_FREE;
  new_req->ptr     = ptr;

  // Send message to the selected scheduler
  ar_assert(!noc_msg_send());

  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
// ===========================================================================
// mm_distr_get_rids()          Requests more free region IDs from our parent
//                              scheduler
// ===========================================================================
// * INPUTS
//   int num_rids               Number of region IDs we need
//
// * RETURN VALUE
//   int                        Message ID of the new request on success
//                              ERR_OUT_OF_RIDS: failure, no more region IDs
//                              in the whole system
// ===========================================================================
int mm_distr_get_rids(int num_rids) {

  Context       *context;
  PrMsgReq      *new_req;
  int           id;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(num_rids);

  // Are we the top-level scheduler?
  if (context->pr_parent_sched_id == -1) {
    // No more region IDs in the whole system
    return ERR_OUT_OF_RIDS;
  }

  // Build message
  new_req = noc_msg_send_get_buf(pr_scheduler_core_id(
                                        context->pr_parent_sched_id));

  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_GET_RIDS;
  new_req->size    = num_rids;

  // Send message to parent
  ar_assert(!noc_msg_send());

  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
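// ===========================================================================
// Illustrative sketch, not part of the original code: the behavior assumed
// of pr_advance_msg_id() at all the call sites in this file -- advance the
// message ID and skip the value 0 on wrap-around, so a request ID of 0 never
// appears on the wire (see also the "avoiding value 0 on wrap-arounds"
// comment in mm_distr_pack() below). This is an assumption drawn from the
// call sites, not the actual implementation.
// ===========================================================================
static unsigned int mm_distr_assumed_next_msg_id(unsigned int msg_id) {
  // Hypothetical stand-in: advance, wrapping back to 1 (never 0)
  msg_id++;
  if (!msg_id) {
    msg_id = 1;
  }
  return msg_id;
}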
PUBLIC char *
ar_typeDeclarator(Type t)
{
  char            *buf;
  TypeDescription d;
  Type            eType;
  list            dimList;
  int             nDims;
  int             i;

  ar_assert(t);

  // Array element type and dimensions
  d = t->description;
  eType = d->structuredDes.array.type;
  dimList = d->structuredDes.array.dimensions;
  nDims = list_size(dimList);

  // Element declarator, followed by one "[]" pair per dimension
  buf = typeDeclarator(eType);
  for (i = 0; i < nDims; i++) {
    buf = cat2(buf, "[]");
  }
  return buf;
}
// ===========================================================================
// dbg_stats_init()             Allocates and zeroes the per-core statistics
//                              counters and initializes the "last timer"
//                              reference value. Does nothing unless
//                              DBG_STATS_ENABLED is defined.
// ===========================================================================
void dbg_stats_init() {
#ifdef DBG_STATS_ENABLED

  Context       *context;
  unsigned int  i;


  // Get context
  context = mm_get_context(ar_get_core_id());

  // Allocate array
  ar_assert(!context->dbg_stats_data);
  context->dbg_stats_data = kt_malloc(DBG_STATS_NUM_STATS * sizeof(int));

  // Zero them all
  for (i = 0; i < DBG_STATS_NUM_STATS; i++) {
    context->dbg_stats_data[i] = 0;
  }

  // Init the "last timer" value to something meaningful
  context->dbg_stats_last_tmr = ar_free_timer_get_ticks();

#endif
}
// =========================================================================== // kt_mem_init() Initializes the kernel memory allocator, // bootstraps it, creates the kernel slab pool // and allocates/initalizes the global Context. // =========================================================================== void kt_mem_init() { Context *context; int my_cid; size_t kernel_base; size_t kernel_end; int i; // Get core ID and kernel limits my_cid = ar_get_core_id(); kernel_base = mm_va_kernel_base(my_cid); kernel_end = kernel_base + MM_KERNEL_SIZE - 1024 * 1024; // Prepare bootstrapping structures on 2 last kernel heap pages. We are // going to keep track of how many objects we allocate per slab slot size, // to a maximum of MM_BOOTSTRAP_MAX_SLOT, as well as which of these // objects are freed during bootstrap. ar_assert(MM_BOOTSTRAP_MAX_SLOT % MM_ALLOC_ALIGN == 0); for (i = 0; i < (MM_BOOTSTRAP_MAX_SLOT / MM_ALLOC_ALIGN); i++) { // A counter for alloc() calls ((int *) kernel_end)[i] = 0; } // Single counter for free() calls *((int *) (kernel_end - MM_PAGE_SIZE)) = 0; // Make sure we have enough space from the maximum number of objects during // bootstrap, so that they do not reach the two last kernel space pages // which are used for the counters above ar_assert((MM_BOOTSTRAP_MAX_SLOT / MM_ALLOC_ALIGN) * MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE < // max bootstrapped adr kernel_end - MM_PAGE_SIZE); // free() counter // Prepare context context = mm_get_context(my_cid); context->dbg_trc_data = NULL; context->dbg_trc_idx = 0; context->dbg_trc_time_start = 0; context->dbg_trc_offset = 0; context->dbg_stats_data = NULL; context->dbg_stats_last_tmr = 0; context->noc_mode = -1; context->noc_cnt_free = NULL; context->noc_send_buf = NULL; context->noc_recv_buf = NULL; context->noc_msg_core_id = -1; context->noc_credits = NULL; context->noc_credits_poll = NULL; context->noc_num_peers = 0; context->noc_poll_rr = 0; context->noc_active_dmas = NULL; #ifdef NOC_WARN_OUT_OF_CREDITS context->noc_cred_warned = 0; #endif context->mm_alloc_bootstrap = 1; context->mm_frees_bootstrap = 1; context->mm_kernel_pool = NULL; context->mm_recursion_depth = 0; for (i = 0; i < MM_SLAB_SIZE / MM_ALLOC_ALIGN; i++) { context->mm_prealloc_flags[i] = 0; } context->mm_prealloc_needed = 0; context->mm_busy_freeing = 0; context->mm_defer_frees = NULL; context->mm_num_defer_frees = 0; context->mm_region_tree = NULL; context->mm_used_rids = NULL; context->mm_free_rids = NULL; context->mm_used_ranges = NULL; context->mm_free_ranges = NULL; context->mm_free_num_slabs = 0; context->mm_free_num_rids = 0; context->mm_local_rids = NULL; context->mm_range_chunk = MM_ADR_RANGE_CHUNK_MAX; context->mm_last_harvest = 0; context->mm_load_rrobin = 0; context->mm_current_load = 0; context->mm_reported_load = 0; context->mm_children_load = NULL; context->pr_core_id = -1; context->pr_num_cores = -1; context->pr_role = -1; context->pr_core_bid_cid = NULL; context->pr_core_work_ids = NULL; context->pr_core_sched_ids = NULL; context->pr_core_child_idx = NULL; context->pr_work_core_ids = NULL; context->pr_sched_core_ids = NULL; context->pr_core_route = NULL; context->pr_num_schedulers = -1; context->pr_num_workers = -1; context->pr_worker_id = -1; context->pr_scheduler_id = -1; context->pr_parent_sched_id = -1; context->pr_scheduler_level = -1; context->pr_children = NULL; context->pr_num_children = -1; context->pr_cur_epoch = 0; context->pr_task_table = NULL; context->pr_ready_queue = NULL; context->pr_spawn_pending = 0; context->pr_tasks = NULL; 
context->pr_avail_task_id = 1; context->pr_sched_rr = 0; context->pr_load_vs_locality = PR_LOAD_VS_LOCALITY_DEFAULT; context->pr_cur_sched_load = 0; context->pr_cur_run_load = 0; context->pr_rep_sched_load = 0; context->pr_rep_run_load = 0; context->pr_chld_sched_load = NULL; context->pr_chld_run_load = NULL; context->pr_main_finished = 0; context->pr_message_id = 1; context->pr_pending_events = NULL; context->pr_incomplete_req = NULL; context->pr_pages_msg_id = 0; context->pr_rids_msg_id = 0; context->vid_demo_in_bid = -1; context->vid_demo_in_cid = -1; context->vid_demo_out_bid = -1; context->vid_demo_out_cid = -1; #if 0 context->sys_sched_sec = 0; context->sys_sched_usec = 0; context->sys_sched_calls = 0; context->sys_worker_sec = 0; context->sys_worker_usec = 0; context->sys_worker_sent = 0; context->sys_worker_recv = 0; context->sys_barrier_sec = 0; context->sys_barrier_usec = 0; #endif #ifdef ARCH_ARM context->fs_flash_num_sectors = 0; context->fs_num_blocks = 0; context->fs_max_inodes = 0; context->fs_state = NULL; context->fs_fdesc = NULL; #endif #ifdef FMPI context->fmpi = NULL; #endif // Context is at a predefined location. Make sure the bootstrapping structs // know about it, so the space can be accounted for later. This is the // first malloc we ever call, so no other object can take this predefined // location (which mm_get_context() has already returned above). ar_assert(kt_malloc(sizeof(Context)) == context); // Bootstrap memory system and create kernel memory pool ar_assert((MM_ALLOC_ALIGN & (MM_ALLOC_ALIGN - 1)) == 0); ar_assert(MM_PAGE_SIZE % MM_SLAB_SIZE == 0); ar_assert(MM_SLAB_SIZE % MM_ALLOC_ALIGN == 0); ar_assert(kernel_base % MM_SLAB_SIZE == 0); ar_assert(kernel_end % MM_SLAB_SIZE == 0); // Create kernel pool. The function will handle the rest of the // bootstrapping process, end the bootstrap mode and fill // context->mm_kernel_pool with the kernel pool. mm_slab_create_pool(NULL, kernel_base, MM_KERNEL_SIZE / MM_SLAB_SIZE, 1); ar_assert(!context->mm_alloc_bootstrap); ar_assert(!context->mm_frees_bootstrap); }
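// ===========================================================================
// Illustrative sketch, not part of the original code: the layout of the two
// bootstrap counter pages that kt_mem_init() prepares above. The page at
// kernel_end holds one alloc counter per slot size (up to
// MM_BOOTSTRAP_MAX_SLOT, in MM_ALLOC_ALIGN steps), indexed exactly as
// kt_malloc() does during bootstrap; the page right below it
// (kernel_end - MM_PAGE_SIZE) holds the single free counter. The helper
// name is hypothetical.
// ===========================================================================
static int kt_bootstrap_alloc_count(size_t kernel_end, size_t size) {
  // Hypothetical helper: how many bootstrap objects of this (aligned,
  // <= MM_BOOTSTRAP_MAX_SLOT) size have been handed out so far
  return ((int *) kernel_end)[size / MM_ALLOC_ALIGN - 1];
}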
// =========================================================================== // =========================================================================== void test_mpi_pipeline(int rank, int num_cores, int packet_size, int num_packets) { volatile int *buf; MPI_Status st; int i; int j; // Sanity checks ar_assert(num_cores > 2); // Allocate buffer buf = kt_malloc(packet_size); // Producer if (rank == 0) { kt_printf("%d: MPI pipeline starts\r\n", rank); for (i = 0; i < num_packets; i++) { for (j = 0; j < packet_size / 4; j++) { buf[j] = rank + i + j; } MPI_Send(buf, packet_size / 4, MPI_INT, rank + 1, 0, MPI_COMM_WORLD); } } // Middleman else if (rank < num_cores - 1) { for (i = 0; i < num_packets; i++) { MPI_Recv(buf, packet_size / 4, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &st); for (j = 0; j < packet_size / 4; j++) { buf[j] += rank; } MPI_Send(buf, packet_size / 4, MPI_INT, rank + 1, 0, MPI_COMM_WORLD); } } // Consumer else { for (i = 0; i < num_packets; i++) { MPI_Recv(buf, packet_size / 4, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &st); for (j = 0; j < packet_size / 4; j++) { if (buf[j] != (num_cores - 1) * (num_cores - 2) / 2 + i + j) { kt_printf("%d: rep %d, buf[%d] = %d [ %s ]\r\n", rank, i, j, buf[j], TEST_FAIL); while (1) { ; } } } } kt_printf("\n%d: Received %d * %d bytes [ %s ]\r\n", rank, num_packets, packet_size, TEST_PASS); } // Free buffer kt_free((void *) buf); }
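// ===========================================================================
// Note, not part of the original code: the consumer check above follows
// from the pipeline arithmetic. The producer (rank 0) writes i + j into
// each word, and every middleman rank r, 1 <= r <= num_cores - 2, adds r,
// so the consumer (rank num_cores - 1) expects
//
//   i + j + 1 + 2 + ... + (num_cores - 2)
//     = (num_cores - 1) * (num_cores - 2) / 2 + i + j
// ===========================================================================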
PRIVATE char *
arrShortName(Type t)
{
  ar_assert(t);
  return easyShortTypeNameCleaned(t);
}
PRIVATE char *
arrJName(Type t)
{
  ar_assert(t);
  return packageDotStringJ(getContextT(t), arrShortName(t));
}
// =========================================================================== // kt_realloc() Reallocates an object to a new size // =========================================================================== // * INPUTS // void *old_ptr The old object to be reallocated // size_t new_size Number of bytes for the new object // // * RETURN VALUE // void * Pointer to new allocated object. Note that // NULL must not be returned -- out of memory // in kernel space should trigger an abort. // =========================================================================== void *kt_realloc(void *old_ptr, size_t new_size) { int old_size; void *new_ptr; Context *context; // Query old pointer. We accept NULL pointers in realloc. context = mm_get_context(ar_get_core_id()); if (old_ptr) { old_size = mm_slab_query_pointer(context->mm_kernel_pool, (size_t) old_ptr); ar_assert(old_size > 0); } else { old_size = 0; } // Clamp requests up to 2-GB size (see kt_malloc above). ar_assert (new_size < (1 << 31)); // Align size request to nearest allowed size if (new_size & (MM_ALLOC_ALIGN - 1)) { new_size = (new_size & ~(MM_ALLOC_ALIGN - 1)) + MM_ALLOC_ALIGN; } // It may happen that the same size is requested, e.g. by asking for // 4, then 8, then 12, ..., but alignment forces all of them to be 64 bytes. // If so, do nothing. if (old_size == new_size) { return old_ptr; } // Underlying slab allocator will never* return the same pointer for a // different slot size. Allocate a new pointer before freeing the old, // so that we avoid Tries or other allocations taking its position before // we manage to copy its contents. // // *: correct is "almost never". Definitely true for < slab_size requests // where not enough empty slabs are preallocated. For other cases, it can // happen that if we freed the pointer before the new malloc, there is a // chance that will happen. For kernel objects, which are small and // dominated by mallocs, not frees, it is nearly impossible as a case. We // choose not to follow it, because it can seriously mess stuff up if an // intermediate malloc (e.g. a Trie kt_malloc) takes up this space and // overwrites the data before we finish here -- and there's no easy way // of knowing that. if (new_size) { ar_assert(new_ptr = kt_malloc(new_size)); } else { new_ptr = NULL; } // Copy contents if (new_ptr && old_size) { kt_memcpy(new_ptr, old_ptr, (old_size < new_size) ? old_size : new_size); } // Free old pointer if (old_ptr) { kt_free(old_ptr); } // Success return new_ptr; }
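// ===========================================================================
// Illustrative usage sketch, not part of the original code: the common
// "grow an array by one element" pattern built on kt_realloc() above, used
// for example by mm_distr_pack() when gathering per-scheduler entries.
// Out-of-memory aborts inside the allocator, so no NULL check is needed.
// The helper name is hypothetical.
// ===========================================================================
static int *kt_example_append_int(int *array, int old_count, int value) {
  // Hypothetical helper, for exposition only
  array = kt_realloc(array, (old_count + 1) * sizeof(int));
  array[old_count] = value;
  return array;
}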
// =========================================================================== // =========================================================================== int stream_bare(int n) { float *a, *b, *c; int avgtime[4] = {0}, maxtime[4] = {0}, mintime[4] = {0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF}; char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "}; float bytes[4] = { 2 * sizeof(float) * n, 2 * sizeof(float) * n, 3 * sizeof(float) * n, 3 * sizeof(float) * n }; int BytesPerWord; register int j, k; float scalar; int t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ kt_printf(HLINE); kt_printf("STREAM version $Revision: 1.1 $\n"); kt_printf(HLINE); // Assign arrays into kernel heap space (too big to fit in stack) a = (float *) mm_va_kernel_base(ar_get_core_id()); b = a + n; c = b + n; ar_assert((unsigned int) ((c + n) - a) < MM_KERNEL_SIZE); BytesPerWord = sizeof(float); //kt_printf("This system uses %d bytes per SINGLE PRECISION word.\n", // BytesPerWord); //kt_printf(HLINE); kt_printf("Array size = %d, Offset = 0, total memory req. = %.2f KB.\n", n, (3.0 * BytesPerWord * (float) n) / 1024.0); //kt_printf("Each test is run %d times, but only\n", NTIMES); //kt_printf("the *best* time for each is used.\n"); //kt_printf(HLINE); /* Get initial value for system clock. */ for (j=0; j<n; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } //if ( (quantum = checktick()) >= 1) // kt_printf("Your clock granularity/precision appears to be " // "%d microseconds.\n", quantum); //else { // kt_printf("Your clock granularity appears to be " // "less than one microsecond.\n"); // quantum = 1; //} ar_timer_reset(); for (j = 0; j < n; j++) a[j] = 2.0E0 * a[j]; t = ar_timer_get_cycles(); //kt_printf("Each test below will take on the order" // " of %d clock cycles.\n", t ); //kt_printf("Increase the size of the arrays if this shows that\n"); //kt_printf("you are not getting at least 20 clock cycles per test.\n"); kt_printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { ar_timer_reset(); for (j=0; j<n; j++) c[j] = a[j]; times[0][k] = ar_timer_get_cycles(); ar_timer_reset(); for (j=0; j<n; j++) b[j] = scalar*c[j]; times[1][k] = ar_timer_get_cycles(); ar_timer_reset(); for (j=0; j<n; j++) c[j] = a[j]+b[j]; times[2][k] = ar_timer_get_cycles(); ar_timer_reset(); for (j=0; j<n; j++) a[j] = b[j]+scalar*c[j]; times[3][k] = ar_timer_get_cycles(); } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } kt_printf("Function Rate (B/cc) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(float)(NTIMES-1); kt_printf("%s%11.4f %13d %11d %11d\n", label[j], (float) bytes[j] / (float) mintime[j], avgtime[j], mintime[j], maxtime[j]); } kt_printf(HLINE); /* --- Check Results --- */ stream_check_results(a, b, c, n); //kt_printf(HLINE); return 0; }
// =========================================================================== // mm_distr_pack() Create multiple packing requests to // appropriate schedulers for packing multiple // objects and regions. // // The regions and objects given to this function // are checked one by one as to which scheduler // should be responsible for them. Before any // messages are sent, per-scheduler arrays are // created to gather all these decisions. When // this sorting process is finished, request(s) // to each scheduler are made, as many as needed // depending on the number of regions/objects // that are needed by each scheduler. // // Because many messages to many schedulers are // sent, an array of new message IDs is retured. // =========================================================================== // * INPUTS // rid_t *regions Array of remote region IDs to be packed // int *region_options Array of remote region packing options // int num_regions Number of regions in array // void **objects Array of remote objects to be packed // int *object_options Array of remote object packing options // int num_objects Number of objects in array // // * OUTPUTS // int **ret_msg_ids Array of returned message IDs, one for each // message sent // int *ret_num_messages Number of messages generated // void **ret_error_ptr In case of error, region ID or object that // triggered it // // * RETURN VALUE // int 0: success, *ret_num_messages generated // ERR_NO_SUCH_REGION: failure, *ret_error_ptr // region ID is invalid in the whole system. // No messages generated. // ERR_OUT_OF_RANGE: failure, *ret_error_ptr // object is invalid in the whole system. // No messages generated. // =========================================================================== int mm_distr_pack(rid_t *regions, int *region_options, int num_regions, void **objects, int *object_options, int num_objects, int **ret_msg_ids, int *ret_num_messages, void **ret_error_ptr) { typedef struct { rid_t *regions; int *region_options; int num_regions; void **objects; int *object_options; int num_objects; } pack_per_sched_type; Context *context; PrMsgReq *req; Trie *trie; pack_per_sched_type *per_sched; int sched_id; int error_status; int cur_reg; int cur_obj; int i; // Sanity checks context = mm_get_context(ar_get_core_id()); ar_assert(ret_msg_ids); ar_assert(ret_num_messages); ar_assert(ret_error_ptr); // Initialize return variables *ret_msg_ids = NULL; *ret_num_messages = 0; *ret_error_ptr = NULL; // Create a trie to store per-scheduler packing needs. We'll be using IDs // from 1 to context->pr_num_schedulers, so the MSB is the log2 of // context->pr_num_schedulers. The kt_int_log2() function will return the // MSB position correctly, even for non-power-of-2 values. ar_assert(trie = kt_alloc_trie(kt_int_log2(context->pr_num_schedulers), 0)); // For all regions for (i = 0; i < num_regions; i++) { // Find which scheduler should we ask for this region sched_id = mm_distr_region_sched_id(regions[i]); if (sched_id == ERR_NO_SUCH_REGION) { // No such region exists anywhere in the system. Abort. *ret_error_ptr = (void *) regions[i]; error_status = ERR_NO_SUCH_REGION; goto error; } // It can't be ours, local functions should have picked it up ar_assert(sched_id != context->pr_scheduler_id); // We use scheduler ID + 1, because tries cannot handle zero keys sched_id++; // Do we have anything else for this scheduler ID? 
kt_trie_find(trie, sched_id, (void *) &per_sched); // Append this region to existing entry if (per_sched) { per_sched->regions = kt_realloc(per_sched->regions, (per_sched->num_regions + 1) * sizeof(rid_t)); per_sched->region_options = kt_realloc(per_sched->region_options, (per_sched->num_regions + 1) * sizeof(int)); per_sched->regions[per_sched->num_regions] = regions[i]; per_sched->region_options[per_sched->num_regions] = region_options[i]; per_sched->num_regions++; } // Create new entry for this scheduler else { per_sched = kt_malloc(sizeof(pack_per_sched_type)); per_sched->regions = kt_malloc(sizeof(rid_t)); per_sched->region_options = kt_malloc(sizeof(int)); per_sched->regions[0] = regions[i]; per_sched->region_options[0] = region_options[i]; per_sched->num_regions = 1; per_sched->objects = NULL; per_sched->object_options = NULL; per_sched->num_objects = 0; // Insert it into the trie ar_assert(!kt_trie_insert(trie, sched_id, per_sched)); } } // For all objects for (i = 0; i < num_objects; i++) { // Find which scheduler should we ask for this object sched_id = mm_distr_object_sched_id(objects[i]); if (sched_id == ERR_OUT_OF_RANGE) { // No such object exists anywhere in the system. Abort. *ret_error_ptr = objects[i]; error_status = ERR_OUT_OF_RANGE; goto error; } // It can't be ours, local functions should have picked it up ar_assert(sched_id != context->pr_scheduler_id); // We use scheduler ID + 1, because tries cannot handle zero keys sched_id++; // Do we have anything else for this scheduler ID? kt_trie_find(trie, sched_id, (void *) &per_sched); // Append this object to existing entry if (per_sched) { per_sched->objects = kt_realloc(per_sched->objects, (per_sched->num_objects + 1) * sizeof(void *)); per_sched->object_options = kt_realloc(per_sched->object_options, (per_sched->num_objects + 1) * sizeof(int)); per_sched->objects[per_sched->num_objects] = objects[i]; per_sched->object_options[per_sched->num_objects] = object_options[i]; per_sched->num_objects++; } // Create new entry for this scheduler else { per_sched = kt_malloc(sizeof(pack_per_sched_type)); per_sched->objects = kt_malloc(sizeof(void *)); per_sched->object_options = kt_malloc(sizeof(int)); per_sched->objects[0] = objects[i]; per_sched->object_options[0] = object_options[i]; per_sched->num_objects = 1; per_sched->regions = NULL; per_sched->region_options = NULL; per_sched->num_regions = 0; // Insert it into the trie ar_assert(!kt_trie_insert(trie, sched_id, per_sched)); } } // For all schedulers that we need to communicate with for (sched_id = kt_trie_find_minmax(trie, 0, (void *) &per_sched); sched_id; sched_id = kt_trie_find_next(trie, 1, (void *) &per_sched)) { // Restore correct scheduler ID value sched_id--; // Initialize cur_reg = 0; cur_obj = 0; req = NULL; // Loop until all regions and objects have been sent while ((cur_reg < per_sched->num_regions) || (cur_obj < per_sched->num_objects)) { // Start building new message req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id)); req->core_id = context->pr_core_id; req->req_id = context->pr_message_id; req->size = 0; // Append this message ID to the array of IDs we'll return ar_assert(*ret_msg_ids = kt_realloc(*ret_msg_ids, (*ret_num_messages + 1) * sizeof(int))); (*ret_msg_ids)[*ret_num_messages] = req->req_id; (*ret_num_messages)++; // Increase message ID, avoiding value 0 on wrap-arounds context->pr_message_id = pr_advance_msg_id(context->pr_message_id); // Embed first remaining region and first remaining object to basic // request if (cur_reg < 
per_sched->num_regions) { ar_assert(per_sched->regions[cur_reg]); req->region = per_sched->regions[cur_reg]; req->size |= per_sched->region_options[cur_reg] << 0; cur_reg++; } else { req->region = 0; } if (cur_obj < per_sched->num_objects) { ar_assert(per_sched->objects[cur_obj]); req->ptr = per_sched->objects[cur_obj]; req->size |= per_sched->object_options[cur_obj] << MM_PACK_OPTION_BITS; cur_obj++; } else { req->ptr = NULL; } // Did we fit or do we need an extended request? if ((cur_reg >= per_sched->num_regions) && (cur_obj >= per_sched->num_objects)) { req->type = REQ_PACK; // Send basic request only to scheduler ar_assert(!noc_msg_send()); req = NULL; } else { req->type = EXT_REQ_PACK; // Put regions in first slots of extended array for (req->num_regions = 0; (req->num_regions < PR_REQ_MAX_SIZE) && (cur_reg < per_sched->num_regions); req->num_regions++, cur_reg++) { ar_assert(per_sched->regions[cur_reg]); req->data[req->num_regions] = (void *) per_sched->regions[cur_reg]; req->size |= per_sched->region_options[cur_reg] << ((2 + req->num_regions) * MM_PACK_OPTION_BITS); } // Put objects on following slots, if regions are finished for (req->num_ptrs = 0; (req->num_regions + req->num_ptrs < PR_REQ_MAX_SIZE) && (cur_obj < per_sched->num_objects); req->num_ptrs++, cur_obj++) { ar_assert(per_sched->objects[cur_obj]); req->data[req->num_regions + req->num_ptrs] = per_sched->objects[cur_obj]; req->size |= per_sched->object_options[cur_obj] << ((2 + req->num_regions + req->num_ptrs) * MM_PACK_OPTION_BITS); } // Send extended request to scheduler ar_assert(!noc_msg_send()); req = NULL; } } // Free this entry kt_free(per_sched->regions); kt_free(per_sched->region_options); kt_free(per_sched->objects); kt_free(per_sched->object_options); kt_free(per_sched); } // Free the trie kt_free_trie(trie, NULL); // Success return 0; error: // Free all allocated per-scheduler entries for (sched_id = kt_trie_find_minmax(trie, 0, (void *) &per_sched); sched_id; sched_id = kt_trie_find_next(trie, 1, (void *) &per_sched)) { kt_free(per_sched->regions); kt_free(per_sched->region_options); kt_free(per_sched->objects); kt_free(per_sched->object_options); kt_free(per_sched); } // Free the trie kt_free_trie(trie, NULL); // Return error code; error_ptr is set above. return error_status; }
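// ===========================================================================
// Illustrative sketch, not part of the original code: how mm_distr_pack()
// above lays out the packing options inside req->size. Slot 0 is the option
// of the region embedded in the basic request, slot 1 the option of the
// embedded object, and slot 2 + k the option of the k-th entry of an
// extended request's data[] array (regions first, then objects). Each slot
// is MM_PACK_OPTION_BITS wide; the mask below assumes every option value
// fits in that many bits. The helper name is hypothetical.
// ===========================================================================
static int mm_distr_pack_get_ext_option(PrMsgReq *req, int data_index) {
  // Hypothetical helper: recover the option of data[data_index]
  return (req->size >> ((2 + data_index) * MM_PACK_OPTION_BITS)) &
         ((1 << MM_PACK_OPTION_BITS) - 1);
}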
// =========================================================================== // mm_distr_ralloc_orphan() Requests a new region creation under a // remote parent region from the appropriate // scheduler, and tells him that it's an orphan // for him, i.e. with a remote region ID. // =========================================================================== // * INPUTS // rid_t parent Region ID of the remote parent region // int src_core_id Core ID of the scheduler that did the request, // so we can take it into account when deciding // which scheduler should own the new orphan // // * RETURN VALUE // int Message ID of the new request // =========================================================================== int mm_distr_ralloc_orphan(rid_t parent, int src_core_id) { Context *context; int src_sched_id; MmRgnTreeNode *parent_node; int child_id; PrMsgReq *new_req; int id; int i; // Sanity checks context = mm_get_context(ar_get_core_id()); ar_assert(parent); ar_assert(context->pr_scheduler_level); ar_assert(context->pr_num_children); src_sched_id = pr_core_scheduler_id(src_core_id); ar_assert((src_sched_id >= 0) && (src_sched_id < context->pr_num_schedulers)); // The parent region should belong to us. Find the region node. ar_assert(kt_trie_find(context->mm_local_rids, parent, (void *) &parent_node)); // In the stand-alone allocator version, we only load-balance if requests // come from "above" (i.e. the request from a worker reaches the top-level // scheduler and starts to descend). In the Myrmics version, there is // no such case: a task is allowed to see only subtrees of objects/regions // it owns, so requests always come from "below". Thus, we always // load-balance. #if 0 // Request comes from one of our children if (src_sched_id != context->pr_parent_sched_id) { // Verify he's one of our children child_id = -1; for (i = 0; i < context->pr_num_children; i++) { if (context->pr_children[i] == src_core_id) { child_id = i; break; } } ar_assert(child_id > -1); } // Request comes from our parent else { #endif // Search for least load, favoring the lastly decided round robin // position, in case there's a draw child_id = context->mm_load_rrobin; for (i = 0; i < context->pr_num_children; i++) { if (context->mm_children_load[i] < context->mm_children_load[child_id]) { child_id = i; } } // New round robin position is the next child from the one we decided context->mm_load_rrobin = child_id + 1; if (context->mm_load_rrobin >= context->pr_num_children) { context->mm_load_rrobin = 0; } #if 0 } #endif // Build message new_req = noc_msg_send_get_buf(context->pr_children[child_id]); new_req->core_id = context->pr_core_id; new_req->req_id = context->pr_message_id; new_req->type = REQ_RALLOC_ORPHAN; new_req->region = parent; new_req->size = parent_node->location; // Send message to the selected scheduler ar_assert(!noc_msg_send()); // Increase message ID id = context->pr_message_id; context->pr_message_id = pr_advance_msg_id(context->pr_message_id); // Success return id; }
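// ===========================================================================
// Illustrative sketch, not part of the original code: the load-balancing
// decision made above, factored out for exposition. It picks the
// least-loaded child, favoring the current round-robin position on a draw,
// and then advances the round-robin pointer past the chosen child. The
// helper name is hypothetical.
// ===========================================================================
static int mm_distr_pick_least_loaded_child(Context *context) {

  int child_id;
  int i;

  // Least load wins; the round-robin position breaks ties
  child_id = context->mm_load_rrobin;
  for (i = 0; i < context->pr_num_children; i++) {
    if (context->mm_children_load[i] < context->mm_children_load[child_id]) {
      child_id = i;
    }
  }

  // Next time, start from the child after the one we just picked
  context->mm_load_rrobin = child_id + 1;
  if (context->mm_load_rrobin >= context->pr_num_children) {
    context->mm_load_rrobin = 0;
  }

  return child_id;
}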
// =========================================================================== // function() FIXME comments // =========================================================================== // * INPUTS // unsigned char *arg1 Describe arg1 // int arg2 Describe arg2 // // * OUTPUTS // int *arg3 Describe arg3 // // * RETURN VALUE // int 0 for success // =========================================================================== void dbg_stats_report(char *filename) { #ifdef DBG_STATS_ENABLED #define SUMMARY_BUF_SIZE 8196 Context *context; int my_bid; int my_cid; int bid; int cid; unsigned int **stats = NULL; char *buf = NULL; char fmt_buf[16]; char *s; unsigned int i; unsigned int j; unsigned int sch_idle_tot = 0; unsigned int sch_idle_avg = 0; int sch_idle_avg_dec = 0; int sch_idle_avg_frac = 0; char sch_idle_avg_units = 0; unsigned int sch_idle_min = UINT_MAX; int sch_idle_min_idx = -1; int sch_idle_min_dec = 0; int sch_idle_min_frac = 0; char sch_idle_min_units = ' '; unsigned int sch_idle_max = 0; int sch_idle_max_idx = -1; int sch_idle_max_dec = 0; int sch_idle_max_frac = 0; char sch_idle_max_units = 0; unsigned int wrk_idle_tot = 0; unsigned int wrk_idle_avg = 0; int wrk_idle_avg_dec = 0; int wrk_idle_avg_frac = 0; char wrk_idle_avg_units = 0; unsigned int wrk_idle_min = UINT_MAX; int wrk_idle_min_idx = -1; int wrk_idle_min_dec = 0; int wrk_idle_min_frac = 0; char wrk_idle_min_units = ' '; unsigned int wrk_idle_max = 0; int wrk_idle_max_idx = -1; int wrk_idle_max_dec = 0; int wrk_idle_max_frac = 0; char wrk_idle_max_units = 0; unsigned int sch_mem_tot = 0; unsigned int sch_mem_avg = 0; int sch_mem_avg_dec = 0; int sch_mem_avg_frac = 0; char sch_mem_avg_units = 0; unsigned int sch_mem_min = UINT_MAX; int sch_mem_min_idx = -1; int sch_mem_min_dec = 0; int sch_mem_min_frac = 0; char sch_mem_min_units = ' '; unsigned int sch_mem_max = 0; int sch_mem_max_idx = -1; int sch_mem_max_dec = 0; int sch_mem_max_frac = 0; char sch_mem_max_units = 0; unsigned int wrk_work_tot = 0; unsigned int wrk_work_avg = 0; int wrk_work_avg_dec = 0; int wrk_work_avg_frac = 0; char wrk_work_avg_units = 0; unsigned int wrk_work_min = UINT_MAX; int wrk_work_min_idx = -1; int wrk_work_min_dec = 0; int wrk_work_min_frac = 0; char wrk_work_min_units = ' '; unsigned int wrk_work_max = 0; int wrk_work_max_idx = -1; int wrk_work_max_dec = 0; int wrk_work_max_frac = 0; char wrk_work_max_units = 0; unsigned int sch_sched_tot = 0; unsigned int sch_sched_avg = 0; int sch_sched_avg_dec = 0; int sch_sched_avg_frac = 0; char sch_sched_avg_units = 0; unsigned int sch_sched_min = UINT_MAX; int sch_sched_min_idx = -1; int sch_sched_min_dec = 0; int sch_sched_min_frac = 0; char sch_sched_min_units = ' '; unsigned int sch_sched_max = 0; int sch_sched_max_idx = -1; int sch_sched_max_dec = 0; int sch_sched_max_frac = 0; char sch_sched_max_units = 0; unsigned int wrk_wait_tot = 0; unsigned int wrk_wait_avg = 0; int wrk_wait_avg_dec = 0; int wrk_wait_avg_frac = 0; char wrk_wait_avg_units = 0; unsigned int wrk_wait_min = UINT_MAX; int wrk_wait_min_idx = -1; int wrk_wait_min_dec = 0; int wrk_wait_min_frac = 0; char wrk_wait_min_units = ' '; unsigned int wrk_wait_max = 0; int wrk_wait_max_idx = -1; int wrk_wait_max_dec = 0; int wrk_wait_max_frac = 0; char wrk_wait_max_units = 0; unsigned int sch_tasks_tot = 0; unsigned int sch_tasks_avg = 0; int sch_tasks_avg_dec = 0; int sch_tasks_avg_frac = 0; char sch_tasks_avg_units = 0; unsigned int sch_tasks_min = UINT_MAX; int sch_tasks_min_idx = -1; int sch_tasks_min_dec = 0; int sch_tasks_min_frac = 
0; char sch_tasks_min_units = ' '; unsigned int sch_tasks_max = 0; int sch_tasks_max_idx = -1; int sch_tasks_max_dec = 0; int sch_tasks_max_frac = 0; char sch_tasks_max_units = 0; unsigned int wrk_tasks_tot = 0; unsigned int wrk_tasks_avg = 0; int wrk_tasks_avg_dec = 0; int wrk_tasks_avg_frac = 0; char wrk_tasks_avg_units = 0; unsigned int wrk_tasks_min = UINT_MAX; int wrk_tasks_min_idx = -1; int wrk_tasks_min_dec = 0; int wrk_tasks_min_frac = 0; char wrk_tasks_min_units = ' '; unsigned int wrk_tasks_max = 0; int wrk_tasks_max_idx = -1; int wrk_tasks_max_dec = 0; int wrk_tasks_max_frac = 0; char wrk_tasks_max_units = 0; unsigned int sch_msg_tot = 0; unsigned int sch_msg_avg = 0; int sch_msg_avg_dec = 0; int sch_msg_avg_frac = 0; char sch_msg_avg_units = 0; unsigned int sch_msg_min = UINT_MAX; int sch_msg_min_idx = -1; int sch_msg_min_dec = 0; int sch_msg_min_frac = 0; char sch_msg_min_units = ' '; unsigned int sch_msg_max = 0; int sch_msg_max_idx = -1; int sch_msg_max_dec = 0; int sch_msg_max_frac = 0; char sch_msg_max_units = 0; unsigned int wrk_msg_tot = 0; unsigned int wrk_msg_avg = 0; int wrk_msg_avg_dec = 0; int wrk_msg_avg_frac = 0; char wrk_msg_avg_units = 0; unsigned int wrk_msg_min = UINT_MAX; int wrk_msg_min_idx = -1; int wrk_msg_min_dec = 0; int wrk_msg_min_frac = 0; char wrk_msg_min_units = ' '; unsigned int wrk_msg_max = 0; int wrk_msg_max_idx = -1; int wrk_msg_max_dec = 0; int wrk_msg_max_frac = 0; char wrk_msg_max_units = 0; unsigned int wrk_ndma_tot = 0; unsigned int wrk_ndma_avg = 0; int wrk_ndma_avg_dec = 0; int wrk_ndma_avg_frac = 0; char wrk_ndma_avg_units = 0; unsigned int wrk_ndma_min = UINT_MAX; int wrk_ndma_min_idx = -1; int wrk_ndma_min_dec = 0; int wrk_ndma_min_frac = 0; char wrk_ndma_min_units = ' '; unsigned int wrk_ndma_max = 0; int wrk_ndma_max_idx = -1; int wrk_ndma_max_dec = 0; int wrk_ndma_max_frac = 0; char wrk_ndma_max_units = 0; unsigned int wrk_sdma_tot = 0; unsigned int wrk_sdma_avg = 0; int wrk_sdma_avg_dec = 0; int wrk_sdma_avg_frac = 0; char wrk_sdma_avg_units = 0; unsigned int wrk_sdma_min = UINT_MAX; int wrk_sdma_min_idx = -1; int wrk_sdma_min_dec = 0; int wrk_sdma_min_frac = 0; char wrk_sdma_min_units = ' '; unsigned int wrk_sdma_max = 0; int wrk_sdma_max_idx = -1; int wrk_sdma_max_dec = 0; int wrk_sdma_max_frac = 0; char wrk_sdma_max_units = 0; // Get context context = mm_get_context(ar_get_core_id()); my_cid = ar_get_core_id(); my_bid = ar_get_board_id(); // ========================================================================= // Gather stats from everybody // ========================================================================= // Top-level scheduler is the master core for reporting if (context->pr_parent_sched_id == -1) { // Allocate space for all stats = kt_malloc(context->pr_num_cores * sizeof(unsigned int *)); for (i = 0; i < context->pr_num_cores; i++) { stats[i] = kt_malloc(DBG_STATS_NUM_STATS * sizeof(unsigned int)); } // For all cores in the setup for (i = 0; i < context->pr_num_cores; i++) { // Get arch-level board/core ID pr_core_arch_bid_cid(i, &bid, &cid); // Is it us? 
if ((bid == my_bid) && (cid == my_cid)) { // Copy our own stats for (j = 0; j < DBG_STATS_NUM_STATS; j++) { stats[i][j] = context->dbg_stats_data[j]; } continue; } // Handshake with peer and tell him to send us his stats; send our // bid/cid so he can communicate back ar_mbox_send(my_cid, bid, cid, DBG_STATS_MAGIC_HSHAKE1); ar_mbox_send(my_cid, bid, cid, (my_bid << 8) | my_cid); // Sanity check: get start-of-transmission ar_assert(ar_mbox_get(my_cid) == DBG_STATS_MAGIC_HSHAKE2); // Copy his stats for (j = 0; j < DBG_STATS_NUM_STATS; j++) { stats[i][j] = ar_mbox_get(my_cid); } // Sanity check: get end-of-transmission ar_assert(ar_mbox_get(my_cid) == DBG_STATS_MAGIC_HSHAKE3); } } // Other cores are slaves else { // Receive master command and his bid/cid ar_assert(ar_mbox_get(my_cid) == DBG_STATS_MAGIC_HSHAKE1); i = ar_mbox_get(my_cid); bid = i >> 8; cid = i & 0xFF; // Send start-of-transmission ar_mbox_send(my_cid, bid, cid, DBG_STATS_MAGIC_HSHAKE2); // Send our stats for (i = 0; i < DBG_STATS_NUM_STATS; i++) { ar_mbox_send(my_cid, bid, cid, context->dbg_stats_data[i]); } // Send end-of-transmission ar_mbox_send(my_cid, bid, cid, DBG_STATS_MAGIC_HSHAKE3); } // ========================================================================= // Create a summary and print it // ========================================================================= // Top-level scheduler only, everybody else get out if (context->pr_parent_sched_id != -1) { return; } // Scheduler & worker idle time for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_sched_ids[i] > -1) { sch_idle_tot += stats[i][DBG_STATS_IDX_TIME_IDLE]; if (stats[i][DBG_STATS_IDX_TIME_IDLE] < sch_idle_min) { sch_idle_min = stats[i][DBG_STATS_IDX_TIME_IDLE]; sch_idle_min_idx = i; } if (stats[i][DBG_STATS_IDX_TIME_IDLE] > sch_idle_max) { sch_idle_max = stats[i][DBG_STATS_IDX_TIME_IDLE]; sch_idle_max_idx = i; } } else { wrk_idle_tot += stats[i][DBG_STATS_IDX_TIME_IDLE]; if (stats[i][DBG_STATS_IDX_TIME_IDLE] < wrk_idle_min) { wrk_idle_min = stats[i][DBG_STATS_IDX_TIME_IDLE]; wrk_idle_min_idx = i; } if (stats[i][DBG_STATS_IDX_TIME_IDLE] > wrk_idle_max) { wrk_idle_max = stats[i][DBG_STATS_IDX_TIME_IDLE]; wrk_idle_max_idx = i; } } } sch_idle_avg = sch_idle_tot / context->pr_num_schedulers; wrk_idle_avg = wrk_idle_tot / context->pr_num_workers; dbg_stats_format(sch_idle_avg, &sch_idle_avg_dec, &sch_idle_avg_frac, &sch_idle_avg_units); dbg_stats_format(sch_idle_min, &sch_idle_min_dec, &sch_idle_min_frac, &sch_idle_min_units); dbg_stats_format(sch_idle_max, &sch_idle_max_dec, &sch_idle_max_frac, &sch_idle_max_units); dbg_stats_format(wrk_idle_avg, &wrk_idle_avg_dec, &wrk_idle_avg_frac, &wrk_idle_avg_units); dbg_stats_format(wrk_idle_min, &wrk_idle_min_dec, &wrk_idle_min_frac, &wrk_idle_min_units); dbg_stats_format(wrk_idle_max, &wrk_idle_max_dec, &wrk_idle_max_frac, &wrk_idle_max_units); // Scheduler mem time for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_sched_ids[i] > -1) { sch_mem_tot += stats[i][DBG_STATS_IDX_TIME_MEM_SERVE]; if (stats[i][DBG_STATS_IDX_TIME_MEM_SERVE] < sch_mem_min) { sch_mem_min = stats[i][DBG_STATS_IDX_TIME_MEM_SERVE]; sch_mem_min_idx = i; } if (stats[i][DBG_STATS_IDX_TIME_MEM_SERVE] > sch_mem_max) { sch_mem_max = stats[i][DBG_STATS_IDX_TIME_MEM_SERVE]; sch_mem_max_idx = i; } } } sch_mem_avg = sch_mem_tot / context->pr_num_schedulers; dbg_stats_format(sch_mem_avg, &sch_mem_avg_dec, &sch_mem_avg_frac, &sch_mem_avg_units); dbg_stats_format(sch_mem_min, &sch_mem_min_dec, &sch_mem_min_frac, 
&sch_mem_min_units); dbg_stats_format(sch_mem_max, &sch_mem_max_dec, &sch_mem_max_frac, &sch_mem_max_units); // Worker work time for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_work_ids[i] > -1) { wrk_work_tot += stats[i][DBG_STATS_IDX_TIME_TASK_EXEC]; if (stats[i][DBG_STATS_IDX_TIME_TASK_EXEC] < wrk_work_min) { wrk_work_min = stats[i][DBG_STATS_IDX_TIME_TASK_EXEC]; wrk_work_min_idx = i; } if (stats[i][DBG_STATS_IDX_TIME_TASK_EXEC] > wrk_work_max) { wrk_work_max = stats[i][DBG_STATS_IDX_TIME_TASK_EXEC]; wrk_work_max_idx = i; } } } wrk_work_avg = wrk_work_tot / context->pr_num_workers; dbg_stats_format(wrk_work_avg, &wrk_work_avg_dec, &wrk_work_avg_frac, &wrk_work_avg_units); dbg_stats_format(wrk_work_min, &wrk_work_min_dec, &wrk_work_min_frac, &wrk_work_min_units); dbg_stats_format(wrk_work_max, &wrk_work_max_dec, &wrk_work_max_frac, &wrk_work_max_units); // Scheduler non-mem time for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_sched_ids[i] > -1) { sch_sched_tot += stats[i][DBG_STATS_IDX_TIME_SCH_SERVE]; if (stats[i][DBG_STATS_IDX_TIME_SCH_SERVE] < sch_sched_min) { sch_sched_min = stats[i][DBG_STATS_IDX_TIME_SCH_SERVE]; sch_sched_min_idx = i; } if (stats[i][DBG_STATS_IDX_TIME_SCH_SERVE] > sch_sched_max) { sch_sched_max = stats[i][DBG_STATS_IDX_TIME_SCH_SERVE]; sch_sched_max_idx = i; } } } sch_sched_avg = sch_sched_tot / context->pr_num_schedulers; dbg_stats_format(sch_sched_avg, &sch_sched_avg_dec, &sch_sched_avg_frac, &sch_sched_avg_units); dbg_stats_format(sch_sched_min, &sch_sched_min_dec, &sch_sched_min_frac, &sch_sched_min_units); dbg_stats_format(sch_sched_max, &sch_sched_max_dec, &sch_sched_max_frac, &sch_sched_max_units); // Worker wait time for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_work_ids[i] > -1) { wrk_wait_tot += stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT]; if (stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT] < wrk_wait_min) { wrk_wait_min = stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT]; wrk_wait_min_idx = i; } if (stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT] > wrk_wait_max) { wrk_wait_max = stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT]; wrk_wait_max_idx = i; } } } wrk_wait_avg = wrk_wait_tot / context->pr_num_workers; dbg_stats_format(wrk_wait_avg, &wrk_wait_avg_dec, &wrk_wait_avg_frac, &wrk_wait_avg_units); dbg_stats_format(wrk_wait_min, &wrk_wait_min_dec, &wrk_wait_min_frac, &wrk_wait_min_units); dbg_stats_format(wrk_wait_max, &wrk_wait_max_dec, &wrk_wait_max_frac, &wrk_wait_max_units); // Scheduler & worker local tasks for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_sched_ids[i] > -1) { sch_tasks_tot += stats[i][DBG_STATS_IDX_NUM_TASKS]; if (stats[i][DBG_STATS_IDX_NUM_TASKS] < sch_tasks_min) { sch_tasks_min = stats[i][DBG_STATS_IDX_NUM_TASKS]; sch_tasks_min_idx = i; } if (stats[i][DBG_STATS_IDX_NUM_TASKS] > sch_tasks_max) { sch_tasks_max = stats[i][DBG_STATS_IDX_NUM_TASKS]; sch_tasks_max_idx = i; } } else { wrk_tasks_tot += stats[i][DBG_STATS_IDX_NUM_TASKS]; if (stats[i][DBG_STATS_IDX_NUM_TASKS] < wrk_tasks_min) { wrk_tasks_min = stats[i][DBG_STATS_IDX_NUM_TASKS]; wrk_tasks_min_idx = i; } if (stats[i][DBG_STATS_IDX_NUM_TASKS] > wrk_tasks_max) { wrk_tasks_max = stats[i][DBG_STATS_IDX_NUM_TASKS]; wrk_tasks_max_idx = i; } } } sch_tasks_avg = sch_tasks_tot / context->pr_num_schedulers; wrk_tasks_avg = wrk_tasks_tot / context->pr_num_workers; dbg_stats_format(sch_tasks_avg, &sch_tasks_avg_dec, &sch_tasks_avg_frac, &sch_tasks_avg_units); dbg_stats_format(sch_tasks_min, &sch_tasks_min_dec, &sch_tasks_min_frac, 
&sch_tasks_min_units); dbg_stats_format(sch_tasks_max, &sch_tasks_max_dec, &sch_tasks_max_frac, &sch_tasks_max_units); dbg_stats_format(wrk_tasks_avg, &wrk_tasks_avg_dec, &wrk_tasks_avg_frac, &wrk_tasks_avg_units); dbg_stats_format(wrk_tasks_min, &wrk_tasks_min_dec, &wrk_tasks_min_frac, &wrk_tasks_min_units); dbg_stats_format(wrk_tasks_max, &wrk_tasks_max_dec, &wrk_tasks_max_frac, &wrk_tasks_max_units); // Scheduler & worker number of messages for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_sched_ids[i] > -1) { sch_msg_tot += stats[i][DBG_STATS_IDX_NUM_MESSAGES]; if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] < sch_msg_min) { sch_msg_min = stats[i][DBG_STATS_IDX_NUM_MESSAGES]; sch_msg_min_idx = i; } if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] > sch_msg_max) { sch_msg_max = stats[i][DBG_STATS_IDX_NUM_MESSAGES]; sch_msg_max_idx = i; } } else { wrk_msg_tot += stats[i][DBG_STATS_IDX_NUM_MESSAGES]; if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] < wrk_msg_min) { wrk_msg_min = stats[i][DBG_STATS_IDX_NUM_MESSAGES]; wrk_msg_min_idx = i; } if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] > wrk_msg_max) { wrk_msg_max = stats[i][DBG_STATS_IDX_NUM_MESSAGES]; wrk_msg_max_idx = i; } } } sch_msg_avg = sch_msg_tot / context->pr_num_schedulers; wrk_msg_avg = wrk_msg_tot / context->pr_num_workers; dbg_stats_format(sch_msg_avg, &sch_msg_avg_dec, &sch_msg_avg_frac, &sch_msg_avg_units); dbg_stats_format(sch_msg_min, &sch_msg_min_dec, &sch_msg_min_frac, &sch_msg_min_units); dbg_stats_format(sch_msg_max, &sch_msg_max_dec, &sch_msg_max_frac, &sch_msg_max_units); dbg_stats_format(wrk_msg_avg, &wrk_msg_avg_dec, &wrk_msg_avg_frac, &wrk_msg_avg_units); dbg_stats_format(wrk_msg_min, &wrk_msg_min_dec, &wrk_msg_min_frac, &wrk_msg_min_units); dbg_stats_format(wrk_msg_max, &wrk_msg_max_dec, &wrk_msg_max_frac, &wrk_msg_max_units); // Worker number of DMAs for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_work_ids[i] > -1) { wrk_ndma_tot += stats[i][DBG_STATS_IDX_NUM_DMAS]; if (stats[i][DBG_STATS_IDX_NUM_DMAS] < wrk_ndma_min) { wrk_ndma_min = stats[i][DBG_STATS_IDX_NUM_DMAS]; wrk_ndma_min_idx = i; } if (stats[i][DBG_STATS_IDX_NUM_DMAS] > wrk_ndma_max) { wrk_ndma_max = stats[i][DBG_STATS_IDX_NUM_DMAS]; wrk_ndma_max_idx = i; } } } wrk_ndma_avg = wrk_ndma_tot / context->pr_num_workers; dbg_stats_format(wrk_ndma_avg, &wrk_ndma_avg_dec, &wrk_ndma_avg_frac, &wrk_ndma_avg_units); dbg_stats_format(wrk_ndma_min, &wrk_ndma_min_dec, &wrk_ndma_min_frac, &wrk_ndma_min_units); dbg_stats_format(wrk_ndma_max, &wrk_ndma_max_dec, &wrk_ndma_max_frac, &wrk_ndma_max_units); // Worker DMA size for (i = 0; i < context->pr_num_cores; i++) { if (context->pr_core_work_ids[i] > -1) { wrk_sdma_tot += stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE]; if (stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE] < wrk_sdma_min) { wrk_sdma_min = stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE]; wrk_sdma_min_idx = i; } if (stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE] > wrk_sdma_max) { wrk_sdma_max = stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE]; wrk_sdma_max_idx = i; } } } wrk_sdma_avg = wrk_sdma_tot / context->pr_num_workers; dbg_stats_format(wrk_sdma_avg, &wrk_sdma_avg_dec, &wrk_sdma_avg_frac, &wrk_sdma_avg_units); dbg_stats_format(wrk_sdma_min, &wrk_sdma_min_dec, &wrk_sdma_min_frac, &wrk_sdma_min_units); dbg_stats_format(wrk_sdma_max, &wrk_sdma_max_dec, &wrk_sdma_max_frac, &wrk_sdma_max_units); // Print summary to buffer buf = kt_malloc(SUMMARY_BUF_SIZE * sizeof(char)); s = buf; s += kt_sprintf(s, "=============================================================================\r\n" " 
Statistics Summary\r\n" "=============================================================================\r\n" "\r\n" " Schedulers Workers\r\n" " ---------------- -------------\r\n" "\r\n" "Idle time: %3d.%d %c (avg) Idle time: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d) %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d) %3d.%d %c (max, core %3d)\r\n" "\r\n" "Mem time: %3d.%d %c (avg) Work time: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d) %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d) %3d.%d %c (max, core %3d)\r\n" "\r\n" "Non-mem time: %3d.%d %c (avg) Wait time: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d) %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d) %3d.%d %c (max, core %3d)\r\n" "\r\n" "Local tasks: %3d.%d %c (avg) Local tasks: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d) %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d) %3d.%d %c (max, core %3d)\r\n" "\r\n" "Num messages: %3d.%d %c (avg) Num messages: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d) %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d) %3d.%d %c (max, core %3d)\r\n" "\r\n" " Num DMAs: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d)\r\n" "\r\n" " DMAed data: %3d.%d %c (avg)\r\n" " %3d.%d %c (min, core %3d)\r\n" " %3d.%d %c (max, core %3d)\r\n" "=============================================================================\r\n", sch_idle_avg_dec, sch_idle_avg_frac, sch_idle_avg_units, wrk_idle_avg_dec, wrk_idle_avg_frac, wrk_idle_avg_units, sch_idle_min_dec, sch_idle_min_frac, sch_idle_min_units, sch_idle_min_idx, wrk_idle_min_dec, wrk_idle_min_frac, wrk_idle_min_units, wrk_idle_min_idx, sch_idle_max_dec, sch_idle_max_frac, sch_idle_max_units, sch_idle_max_idx, wrk_idle_max_dec, wrk_idle_max_frac, wrk_idle_max_units, wrk_idle_max_idx, sch_mem_avg_dec, sch_mem_avg_frac, sch_mem_avg_units, wrk_work_avg_dec, wrk_work_avg_frac, wrk_work_avg_units, sch_mem_min_dec, sch_mem_min_frac, sch_mem_min_units, sch_mem_min_idx, wrk_work_min_dec, wrk_work_min_frac, wrk_work_min_units, wrk_work_min_idx, sch_mem_max_dec, sch_mem_max_frac, sch_mem_max_units, sch_mem_max_idx, wrk_work_max_dec, wrk_work_max_frac, wrk_work_max_units, wrk_work_max_idx, sch_sched_avg_dec, sch_sched_avg_frac, sch_sched_avg_units, wrk_wait_avg_dec, wrk_wait_avg_frac, wrk_wait_avg_units, sch_sched_min_dec, sch_sched_min_frac, sch_sched_min_units, sch_sched_min_idx, wrk_wait_min_dec, wrk_wait_min_frac, wrk_wait_min_units, wrk_wait_min_idx, sch_sched_max_dec, sch_sched_max_frac, sch_sched_max_units, sch_sched_max_idx, wrk_wait_max_dec, wrk_wait_max_frac, wrk_wait_max_units, wrk_wait_max_idx, sch_tasks_avg_dec, sch_tasks_avg_frac, sch_tasks_avg_units, wrk_tasks_avg_dec, wrk_tasks_avg_frac, wrk_tasks_avg_units, sch_tasks_min_dec, sch_tasks_min_frac, sch_tasks_min_units, sch_tasks_min_idx, wrk_tasks_min_dec, wrk_tasks_min_frac, wrk_tasks_min_units, wrk_tasks_min_idx, sch_tasks_max_dec, sch_tasks_max_frac, sch_tasks_max_units, sch_tasks_max_idx, wrk_tasks_max_dec, wrk_tasks_max_frac, wrk_tasks_max_units, wrk_tasks_max_idx, sch_msg_avg_dec, sch_msg_avg_frac, sch_msg_avg_units, wrk_msg_avg_dec, wrk_msg_avg_frac, wrk_msg_avg_units, sch_msg_min_dec, sch_msg_min_frac, sch_msg_min_units, sch_msg_min_idx, wrk_msg_min_dec, wrk_msg_min_frac, wrk_msg_min_units, wrk_msg_min_idx, sch_msg_max_dec, sch_msg_max_frac, sch_msg_max_units, sch_msg_max_idx, wrk_msg_max_dec, wrk_msg_max_frac, wrk_msg_max_units, wrk_msg_max_idx, wrk_ndma_avg_dec, wrk_ndma_avg_frac, wrk_ndma_avg_units, 
wrk_ndma_min_dec, wrk_ndma_min_frac, wrk_ndma_min_units, wrk_ndma_min_idx, wrk_ndma_max_dec, wrk_ndma_max_frac, wrk_ndma_max_units, wrk_ndma_max_idx, wrk_sdma_avg_dec, wrk_sdma_avg_frac, wrk_sdma_avg_units, wrk_sdma_min_dec, wrk_sdma_min_frac, wrk_sdma_min_units, wrk_sdma_min_idx, wrk_sdma_max_dec, wrk_sdma_max_frac, wrk_sdma_max_units, wrk_sdma_max_idx ); if (s - buf > SUMMARY_BUF_SIZE) { ar_panic("Summary buffer overflow"); } // Print it ar_uart_flush(); ar_timer_busy_wait_msec(200); kt_printf("\r\n%s\r\n", buf); // ========================================================================= // Dump both the summary and all the per-core analytical stats to a file // ========================================================================= // Print file begin dump header ar_uart_flush(); ar_timer_busy_wait_msec(200); kt_printf(DBG_FILE_DUMP_BEGIN_FORMAT, filename); ar_uart_flush(); ar_timer_busy_wait_msec(20); // Print the summary kt_printf("%s\r\n", buf); // For all cores in the setup for (i = 0; i < context->pr_num_cores; i++) { // Print the analytical stats kt_printf("==================================================\r\n"); kt_printf("Analytical Statistics for Core %d [%s]\r\n", i, (context->pr_core_sched_ids[i] > -1) ? "Scheduler" : "Worker"); kt_printf("==================================================\r\n"); for (j = 0; j < DBG_STATS_NUM_STATS; j++) { // Print relevant fields if ((context->pr_core_sched_ids[i] > -1) && ((j == 1) || (j == 2) || (j == 7) || (j == 8))) { continue; } dbg_stats_format_number(fmt_buf, stats[i][j]); kt_printf("%s %15s\r\n", (j == 0) ? "Idle time: " : (j == 1) ? "Work time: " : (j == 2) ? "Wait time: " : (j == 3) ? "Mem time: " : (j == 4) ? "Non-mem time: " : (j == 5) ? "Local tasks: " : (j == 6) ? "Num messages: " : (j == 7) ? "Num DMAs: " : (j == 8) ? "DMAed data: " : "!!!ERROR!!!", fmt_buf); } kt_printf("==================================================\r\n\r\n"); } // Print file end dump header ar_uart_flush(); ar_timer_busy_wait_msec(200); kt_printf(DBG_FILE_DUMP_END_FORMAT, filename); ar_uart_flush(); ar_timer_busy_wait_msec(20); // Free stuff kt_free(stats); kt_free(buf); #endif }
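// ===========================================================================
// NOTE: the summary code above relies on dbg_stats_format(), defined
// elsewhere in this file, to split a raw counter into a decimal part, one
// fractional digit and a units character for compact "%3d.%d %c" printing
// (e.g. 1234567 -> "1.2 M"). The function below is NOT that implementation;
// it is only a minimal sketch of the interface the callers assume. The
// name, thresholds and exact prototype are illustrative assumptions.
// ===========================================================================
static void dbg_stats_format_sketch(unsigned long long val, int *dec,
                                    int *frac, char *units) {

  // Pick the largest power-of-1000 unit that keeps the decimal part small,
  // and keep a single fractional digit for the printout
  if (val >= 1000000000ULL) {
    *dec   = val / 1000000000ULL;
    *frac  = (val % 1000000000ULL) / 100000000ULL;
    *units = 'G';
  }
  else if (val >= 1000000ULL) {
    *dec   = val / 1000000ULL;
    *frac  = (val % 1000000ULL) / 100000ULL;
    *units = 'M';
  }
  else if (val >= 1000ULL) {
    *dec   = val / 1000ULL;
    *frac  = (val % 1000ULL) / 100ULL;
    *units = 'K';
  }
  else {
    *dec   = val;
    *frac  = 0;
    *units = ' ';
  }
}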
// ===========================================================================
// kt_malloc()                  Kernel basic allocation function. Allocates
//                              serially in predefined places (see above)
//                              during bootstrap, or calls the slab allocator
//                              out of bootstrap.
// ===========================================================================
// * INPUTS
//   size_t size                Number of bytes to be allocated (can be 0)
//
// * RETURN VALUE
//   void *                     Pointer to new allocated object. Note that
//                              NULL will not be returned for a non-zero size
//                              -- out of memory in kernel space will trigger
//                              an abort.
// ===========================================================================
void *kt_malloc(size_t size) {

  Context       *context;
  int           my_cid;
  size_t        kernel_base;
  size_t        kernel_end;
  int           *bootstrap_slots;
  size_t        ptr;
  int           i;


  // Allow dummy mallocs
  if (!size) {
    return NULL;
  }

  // Refuse requests of 2 GB or more. We won't support them, even for the
  // x86_64 port: it makes it easier to work internally with signed integers,
  // because error checking and assertions work much better.
  ar_assert(size < ((size_t) 1 << 31));

  // Get global context and boundaries
  my_cid = ar_get_core_id();
  context = mm_get_context(my_cid);
  kernel_base = mm_va_kernel_base(my_cid);
  kernel_end = kernel_base + MM_KERNEL_SIZE - 1024 * 1024;

  // Align size request to nearest allowed size
  if (size & (MM_ALLOC_ALIGN - 1)) {
    size = (size & ~(MM_ALLOC_ALIGN - 1)) + MM_ALLOC_ALIGN;
  }

  // Bootstrapping code
  if (context->mm_alloc_bootstrap) {

    bootstrap_slots = (int *) kernel_end;

    // We support only up to MM_BOOTSTRAP_MAX_SLOT sized requests; increase
    // that if this assertion fails (it means bigger objects are needed and
    // bootstrap has to allow this)
    ar_assert(size <= MM_BOOTSTRAP_MAX_SLOT);

    // Allocate directly from the beginning of the kernel heap, keeping track
    // of allocations in the last heap page
    i = size / MM_ALLOC_ALIGN - 1;                       // find slot
    ptr = kernel_base +                                  // base address
          i * MM_SLAB_SIZE * MM_BOOTSTRAP_SLABS_STEP +   // slabs per slot
          bootstrap_slots[i] * size;                     // slot address

    // Remember how many objects we've allocated
    bootstrap_slots[i]++;

    // Make sure enough slots can fit into MM_BOOTSTRAP_SLABS_STEP; otherwise,
    // the define must be increased
    ar_assert(bootstrap_slots[i] * size <=
              MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE);

    return (void *) ptr;
  }

  // Do a normal allocation from the kernel pool
  if (mm_slab_alloc_slot(context->mm_kernel_pool, size, &ptr)) {
    // Kernel memory should never get full
    ar_abort();
  }
  ar_assert(ptr >= kernel_base);
  ar_assert(ptr < kernel_end + MM_PAGE_SIZE);

  return (void *) ptr;
}
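// ===========================================================================
// NOTE: a worked example of the bootstrap slot arithmetic above, assuming
// (for illustration only, not values taken from the headers) that
// MM_ALLOC_ALIGN is 64 bytes, MM_SLAB_SIZE is 4 KB and
// MM_BOOTSTRAP_SLABS_STEP is 16. A 150-byte request is first rounded up to
// 192 bytes (3 * 64), so i = 192 / 64 - 1 = 2 and it is served from the
// third slot. That slot starts at kernel_base + 2 * 16 * 4096 =
// kernel_base + 128 KB, and the n-th such allocation lands at offset
// n * 192 inside it. The closing assertion checks n * 192 <= 16 * 4096,
// i.e. at most 341 such objects fit before MM_BOOTSTRAP_SLABS_STEP must be
// increased. kt_free() below reverses the same arithmetic to recover the
// slot ID and offset of a bootstrap pointer.
// ===========================================================================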
// ===========================================================================
// kt_free()                    Frees an object. During bootstrap, it simply
//                              tracks it for freeing later on. Out of
//                              bootstrap, calls the slab allocator to free
//                              it.
// ===========================================================================
// * INPUTS
//   void *ptr                  Object to be freed
// ===========================================================================
void kt_free(void *ptr) {

  Context       *context;
  int           my_cid;
  size_t        kernel_base;
  size_t        kernel_end;
  int           *bootstrap_slots;
  int           *counter;
  void          **free_slots;
  int           slot_id;
  int           slot_offset;


  // Dummy free?
  if (!ptr) {
    return;
  }

  // Get global context and boundaries
  my_cid = ar_get_core_id();
  context = mm_get_context(my_cid);
  kernel_base = mm_va_kernel_base(my_cid);
  kernel_end = kernel_base + MM_KERNEL_SIZE - 1024 * 1024;

  // Sanity checks
  ar_assert(!((size_t) ptr & (MM_ALLOC_ALIGN - 1)));
  ar_assert((size_t) ptr >= kernel_base);
  ar_assert((size_t) ptr < kernel_end + MM_PAGE_SIZE);

  // Bootstrapping?
  if (context->mm_frees_bootstrap) {

    // Verify it's about a slot we actually gave
    bootstrap_slots = (int *) kernel_end;
    slot_id = ((size_t) ptr - kernel_base) /
              (MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE);
    ar_assert(slot_id * MM_ALLOC_ALIGN <= MM_BOOTSTRAP_MAX_SLOT);
    ar_uint_divide((size_t) ptr - kernel_base -
                       (slot_id * MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE),
                   (slot_id + 1) * MM_ALLOC_ALIGN,
                   (unsigned int *) &slot_offset, NULL);
    ar_assert(slot_offset < bootstrap_slots[slot_id]);

    // Record the free request
    counter = (int *) ((size_t) kernel_end - MM_PAGE_SIZE);
    free_slots = (void **) ((size_t) kernel_end - MM_PAGE_SIZE + sizeof(int));

    free_slots[*counter] = ptr;
    (*counter)++;

    // Make sure we don't overflow the array
    ar_assert((MM_PAGE_SIZE - sizeof(int)) / sizeof(void *) > *counter);

    return;
  }

  // Do a normal free from the kernel pool
  ar_assert(!mm_slab_free_slot(context->mm_kernel_pool, (size_t) ptr));
}
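// ===========================================================================
// NOTE: a minimal, hypothetical usage sketch for the two kernel allocator
// entry points above; it is not part of the runtime. It only illustrates
// the contract documented in the headers: kt_malloc(0) and kt_free(NULL)
// are harmless no-ops, a non-zero request never returns NULL (kernel
// out-of-memory aborts instead), and every returned pointer is
// MM_ALLOC_ALIGN-aligned.
// ===========================================================================
static void kt_malloc_usage_sketch(void) {

  int *array;
  int i;

  // Non-zero requests always succeed (or abort internally)
  array = kt_malloc(16 * sizeof(int));
  for (i = 0; i < 16; i++) {
    array[i] = i;
  }

  // Returned pointers honor the kernel allocation alignment
  ar_assert(!((size_t) array & (MM_ALLOC_ALIGN - 1)));

  // Freeing NULL is allowed and does nothing
  kt_free(NULL);

  // Normal free; during bootstrap this is only recorded for later
  kt_free(array);
}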
// ===========================================================================
// kmeans_mpi()                 Runs a k-means clustering benchmark on top of
//                              MPI. Each core generates random input objects,
//                              clusters them over a number of repetitions,
//                              and rank 0 combines the per-core reductions
//                              and prints the resulting cluster centers.
// ===========================================================================
// * INPUTS
//   int num_procs              MPI processors to use
//   int num_clusters           Number of output clusters
//   int num_objects            Number of total input objects
//   int num_reps               Number of loop repetitions
//
// * RETURN VALUE
//   int                        0 for success, 1 if the setup is invalid
// ===========================================================================
int kmeans_mpi(int num_procs,     // MPI processors to use
               int num_clusters,  // number of output clusters
               int num_objects,   // number of total input objects
               int num_reps) {    // loop repetitions

  int           num_cores;
  int           rank;
  int           objects_per_core;
  float         *objects;
  int           *membership;
  float         *clusters;
  float         *partial_clusters;
  int           *partial_sizes;
  float         *reduce_clusters;
  int           *reduce_sizes;
  unsigned int  seed = 42;
  unsigned int  time_start = 0;
  unsigned int  time_stop;
  unsigned int  time;
  int           i;
  int           j;
  int           loop;


  // Who are we?
  MPI_Comm_size(MPI_COMM_WORLD, &num_cores);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Sanity checks
  if (num_cores < num_procs) {
    if (!rank) {
      kt_printf("Cannot run with %d cores, MPI setup has only %d cores\r\n",
                num_procs, num_cores);
    }
    return 1;
  }
  if (num_objects % num_procs) {
    kt_printf("%d objects not divisible by %d cores\r\n",
              num_objects, num_procs);
    return 1;
  }
  objects_per_core = num_objects / num_procs;

  // Synchronize everyone and print infomercial
  MPI_Barrier(MPI_COMM_WORLD);
  if (!rank) {
    kt_printf("k-means of %d -> %d starting on %d core(s)\r\n",
              num_objects, num_clusters, num_procs);
  }
  MPI_Barrier(MPI_COMM_WORLD);

  // Create random input objects per core
  objects = kt_malloc(objects_per_core * COORDS * sizeof(float));
  for (i = 0; i < objects_per_core; i++) {
    for (j = 0; j < COORDS; j++) {
      objects[i * COORDS + j] = (float) ((seed = kt_rand(seed)) % 1000) /
                                10.0F;
    }
  }

  // Create membership, no object belongs to any cluster yet
  membership = kt_malloc(objects_per_core * sizeof(int));
  for (i = 0; i < objects_per_core; i++) {
    membership[i] = -1;
  }

  // Create partial cluster arrays, to be used as temporary buffers, and their
  // size arrays. Init everything to a neutral value, so they can be used
  // for reductions even with cores that are not part of the setup.
  partial_clusters = kt_malloc(num_clusters * COORDS * sizeof(float));
  partial_sizes = kt_malloc(num_clusters * sizeof(int));
  for (i = 0; i < num_clusters; i++) {
    partial_sizes[i] = 0;
    for (j = 0; j < COORDS; j++) {
      partial_clusters[i * COORDS + j] = 0.0F;
    }
  }

  // Rank 0 allocates extra buffers for the reductions
  if (!rank) {
    reduce_clusters = kt_malloc(num_clusters * COORDS * sizeof(float));
    reduce_sizes = kt_malloc(num_clusters * sizeof(int));
  }
  else {
    reduce_clusters = NULL;
    reduce_sizes = NULL;
  }

  // Allocate stable clusters (will be changed across loop repetitions) for
  // everyone. Prepare initial clusters for core 0 with centers copied from
  // the first objects.
  clusters = kt_malloc(num_clusters * COORDS * sizeof(float));
  if (!rank) {
    ar_assert(objects_per_core >= num_clusters);
    for (i = 0; i < num_clusters; i++) {
      for (j = 0; j < COORDS; j++) {
        clusters[i * COORDS + j] = objects[i * COORDS + j];
      }
    }
  }

  // Keep time
  MPI_Barrier(MPI_COMM_WORLD);
  if (!rank) {
    time_start = ar_free_timer_get_ticks();
  }

  // For all repetitions
  for (loop = 0; loop < num_reps; loop++) {

    // Broadcast clusters for this rep from core 0 to everyone
    MPI_Bcast(clusters, num_clusters * COORDS, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // Process our objects (only for cores belonging to the setup).
if (rank < num_procs) { kmeans_mpi_do_tile(objects, membership, partial_clusters, partial_sizes, clusters, objects_per_core, num_clusters); } // Reduce all results to rank 0 (non-setup cores will add their 0 values) MPI_Reduce(partial_clusters, reduce_clusters, num_clusters * COORDS, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(partial_sizes, reduce_sizes, num_clusters, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); // Rank 0 prepares the next clusters, based on reduction values if (!rank) { kt_printf("done %d of %d\r\n", loop + 1, num_reps); for (i = 0; i < num_clusters; i++) { for (j = 0; j < COORDS; j++) { clusters[i * COORDS + j] = reduce_clusters[i * COORDS + j] / (float) reduce_sizes[i]; } } } } MPI_Barrier(MPI_COMM_WORLD); // Compute elapsed time if (!rank) { time_stop = ar_free_timer_get_ticks(); if (time_stop > time_start) { time = time_stop - time_start; } else { time = 0xFFFFFFFF - (time_start - time_stop); } kt_printf("Time: %10u cycles (%6u msec)\r\n", time, time / 10000); // Print clusters for (i = 0; i < num_clusters; i++) { kt_printf("Cluster %d: %f %f %f\r\n", i, clusters[i * COORDS], clusters[i * COORDS + 1], clusters[i * COORDS + 2]); } } // Free stuff kt_free(clusters); kt_free(partial_sizes); kt_free(partial_clusters); kt_free(reduce_sizes); kt_free(reduce_clusters); kt_free(objects); kt_free(membership); return 0; }
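// ===========================================================================
// NOTE: kmeans_mpi_do_tile() is defined elsewhere in this codebase; the
// function below is NOT that implementation. It is only a sketch of the
// per-tile k-means step the repetition loop above relies on: assign each
// local object to its nearest current cluster center and accumulate
// per-cluster coordinate sums and sizes, which the MPI_Reduce calls then
// combine on rank 0. The name is invented; COORDS is the coordinate count
// used throughout this file.
// ===========================================================================
static void kmeans_tile_sketch(float *objects, int *membership,
                               float *partial_clusters, int *partial_sizes,
                               float *clusters, int num_objects,
                               int num_clusters) {

  float dist;
  float best_dist;
  int   best_cluster;
  float d;
  int   i;
  int   j;
  int   k;

  // Reset the partial accumulators for this repetition
  for (j = 0; j < num_clusters; j++) {
    partial_sizes[j] = 0;
    for (k = 0; k < COORDS; k++) {
      partial_clusters[j * COORDS + k] = 0.0F;
    }
  }

  // For every local object, find the nearest cluster center
  for (i = 0; i < num_objects; i++) {
    best_dist = -1.0F;
    best_cluster = 0;
    for (j = 0; j < num_clusters; j++) {
      dist = 0.0F;
      for (k = 0; k < COORDS; k++) {
        d = objects[i * COORDS + k] - clusters[j * COORDS + k];
        dist += d * d;                          // squared Euclidean distance
      }
      if ((best_dist < 0.0F) || (dist < best_dist)) {
        best_dist = dist;
        best_cluster = j;
      }
    }

    // Remember the assignment and accumulate the partial sums
    membership[i] = best_cluster;
    partial_sizes[best_cluster]++;
    for (k = 0; k < COORDS; k++) {
      partial_clusters[best_cluster * COORDS + k] += objects[i * COORDS + k];
    }
  }
}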
// =========================================================================== // _sys_spawn() Spawns a new task for parallel execution. // =========================================================================== // * INPUTS // char *filename Source code filename where this call is done // int line_nr Line number in filename this call is done // unsigned int idx Which task to run, specified as an index // to a global function pointer table named // void (*_sys_task_table[]) // void **args Array of in-order task arguments. All arguments // must fit into a void * typecast. // unsigned int *types Array of in-order argument types. Each type // is a bitmask of SYS_TYPE_* properties as // defined in syscall.h header. // unsigned int num_args Number of task arguments // =========================================================================== void _sys_spawn(char *filename, int line_nr, unsigned int idx, void **args, unsigned int *types, unsigned int num_args) { Context *context; PrMsgReq *req; ListNode *node; PrTaskDescr *ptask; PrEvtPending *event; int i; int j; // Get context context = mm_get_context(ar_get_core_id()); // What's the parent task? node = kt_list_head(context->pr_ready_queue); ar_assert(node); ptask = node->data; ar_assert(ptask); // Sanity checks ar_assert(idx < (1 << PR_TASK_IDX_SIZE)); ar_assert(num_args > 0); // No task should be declared without a memory // footprint. If for some reason this is desirable, // the while loop below must be altered (it won't // work for num_args == 0). //kt_printf("%d: Parent task = 0x%X (idx = %d), spawning task idx = %d\r\n", // context->pr_core_id, ptask->id, ptask->index, idx); // Single-core mode? if (context->pr_num_cores == 1) { // FIXME: don't involve scheduler, ar_exec() directly... ar_abort(); } // Make sure we haven't got any previous spawn request pending. We throttle // the spawn rate here, in order to avoid race conditions: we are allowed // to spawn a new task only when the scheduler has signalled (through a // REPLY_SPAWN) that the old task spawn has progressed up to a safe point // which will not create any race. if (context->pr_spawn_pending) { ar_assert(!pr_event_worker_inner_loop(1, 0, 0, NULL)); } // Mark that a new spawn request is now pending ar_assert(!context->pr_spawn_pending); context->pr_spawn_pending = 1; // Enter multi-part request creation loop i = 0; j = 0; req = NULL; while (i < num_args) { // New message part? if (!j) { // Get a request buffer req = noc_msg_send_get_buf(pr_scheduler_core_id( context->pr_parent_sched_id)); // Build new message req->core_id = context->pr_core_id; req->req_id = context->pr_message_id; // same ID for all parts req->type = EXT_REQ_SPAWN; req->size = 0; // init I/O type bitmap req->region = 0; // init region/byvalue type bitmap req->ptr = (void *) ((ptask->id << PR_TASK_IDX_SIZE) | idx); req->num_regions = (i != 0); // new request part? } // Fill out next argument req->data[j] = args[i]; req->size |= (types[i] & SYS_TYPE_IN_ARG) ? (1 << (2 * j)) : 0; req->size |= (types[i] & SYS_TYPE_OUT_ARG) ? (1 << (2 * j + 1)) : 0; req->region |= (types[i] & SYS_TYPE_SAFE_ARG) ? (1 << (2 * j)) : 0; req->region |= (types[i] & SYS_TYPE_REGION_ARG) ? (1 << (2 * j + 1)) : 0; //kt_printf("Spawn arg %d: %s %s %s %s\r\n", i, // (types[i] & SYS_TYPE_IN_ARG) ? "IN" : "", // (types[i] & SYS_TYPE_OUT_ARG) ? "OUT" : "", // (types[i] & SYS_TYPE_SAFE_ARG) ? "SAFE" : "", // (types[i] & SYS_TYPE_REGION_ARG) ? "REGION" : "" // ); i++; j++; // Finished with this part? 
if ((j == PR_REQ_MAX_SIZE) || (i == num_args)) { // More to follow? if ((j == PR_REQ_MAX_SIZE) && (i < num_args)) { req->num_ptrs = -1; } else { req->num_ptrs = j; } // Send message to scheduler ar_assert(!noc_msg_send()); // Finished with this request buffer j = 0; req = NULL; } } // Create a note-to-self event to wait for the reply of this spawn request event = kt_malloc(sizeof(PrEvtPending)); event->req = kt_malloc(sizeof(PrMsgReq)); event->req->core_id = -1; event->req->req_id = -1; event->req->type = SELF_WAIT_SPAWN; event->action = PR_ACT_REDO; event->prev = NULL; event->next = NULL; event->data = NULL; // Store event; we don't expect conflicts on this message ID. ar_assert(!kt_trie_insert(context->pr_pending_events, context->pr_message_id, event)); // Increase message ID, avoiding value 0 on wrap-arounds context->pr_message_id = pr_advance_msg_id(context->pr_message_id); }
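// ===========================================================================
// NOTE: a minimal, hypothetical caller sketch for _sys_spawn() above; it is
// not part of the runtime, where such calls are normally generated by the
// compiler/runtime glue. The task index, argument values and the
// "my_buffer" / "my_region" names are invented for illustration; the
// SYS_TYPE_* flags and the _sys_task_table indexing are the ones described
// in the header comment.
// ===========================================================================
static void sys_spawn_usage_sketch(void *my_buffer, rid_t my_region) {

  void         *args[2];
  unsigned int types[2];

  // First argument: an object passed by reference, read and written by the
  // spawned task
  args[0]  = my_buffer;
  types[0] = SYS_TYPE_IN_ARG | SYS_TYPE_OUT_ARG;

  // Second argument: a whole region handed to the task (region IDs must fit
  // in a void * typecast, as the header comment requires)
  args[1]  = (void *) (size_t) my_region;
  types[1] = SYS_TYPE_IN_ARG | SYS_TYPE_OUT_ARG | SYS_TYPE_REGION_ARG;

  // Spawn task number 3 of the global task table. Per the throttling above,
  // the call may first wait for a previous spawn to be acknowledged before
  // queueing this request to the parent scheduler.
  _sys_spawn(__FILE__, __LINE__, 3, args, types, 2);
}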