Example #1
// ===========================================================================
// mm_distr_object_sched_id()   Finds out which scheduler is responsible for
//                              a given object. This is approximate: it 
//                              indicates the direction to be searched
//                              (one level up or one level down). A query
//                              must be made at that level to learn more.
// ===========================================================================
// * INPUTS
//   void *ptr                  The object we're searching for
//
// * RETURN VALUE
//   int                        scheduler_id on success
//                              ERR_OUT_OF_RANGE: pointer does not exist
//                                 anywhere in the system
// ===========================================================================
int mm_distr_object_sched_id(void *ptr) {

  Context       *context;
  MmRgnAdrRange *range;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(ptr);
  ar_assert(context->mm_used_ranges);

  // See if the pointer belongs to our scheduler subtree. We have to check,
  // because the search initially heads upwards (from child scheduler to
  // parent scheduler), but later on, once it passes the top level, it heads
  // downwards (from parent scheduler to child scheduler) until it reaches
  // the correct scheduler.
  kt_trie_find_approx(context->mm_used_ranges, 0, (size_t) ptr, 
                      (void *) &range);
  if (range) {
    // We found something, but does it contain the pointer?
    ar_assert(range->address <= (size_t) ptr);
    if ((size_t) ptr >= range->address + range->num_slabs * MM_SLAB_SIZE) {
      range = NULL;
    }
  }

  // We know nothing about it
  if (!range) {

    // Are we the top-level scheduler?
    if (context->pr_parent_sched_id == -1) {
      // No such pointer exists anywhere in the system
      return ERR_OUT_OF_RANGE;
    }

    // Otherwise, our parent scheduler should know
    else {
      return context->pr_parent_sched_id;
    }
  }

  // Region belongs to our subtree
  else {

    // Return appropriate child scheduler
    return range->sched_id;
  }
}
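
Since the return value is only a direction, a caller has to iterate: re-query
at the indicated scheduler until one answers authoritatively. The sketch below
illustrates such a loop; query_owner() is a hypothetical remote call invented
for this illustration, and error codes are assumed to be negative. Only
mm_distr_object_sched_id() and ERR_OUT_OF_RANGE come from the source.

extern int query_owner(int sched_id, void *ptr);  // hypothetical RPC

// Hypothetical routing loop: follow the directional answers up or down
// the scheduler tree until a scheduler confirms it owns the object.
int route_object_query(void *ptr) {
  int next = mm_distr_object_sched_id(ptr);

  while (next >= 0) {
    int answer = query_owner(next, ptr);
    if (answer == next) {
      return next;    // scheduler confirmed ownership
    }
    next = answer;    // keep following the indicated direction
  }
  return next;        // ERR_OUT_OF_RANGE propagates to the caller
}
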
Example #2
// ===========================================================================
// mm_distr_region_sched_id()   Finds out which scheduler is responsible
//                              for a given region. This is approximate: it
//                              indicates the direction to be searched
//                              (one level up or one level down). A query
//                              must be made at that level to learn more.
// ===========================================================================
// * INPUTS
//   rid_t region               The region ID we're searching for
//
// * RETURN VALUE
//   int                        scheduler_id on success
//                              ERR_NO_SUCH_REGION: region does not exist
//                                 anywhere in the system
// ===========================================================================
int mm_distr_region_sched_id(rid_t region) {

  Context       *context;
  MmRgnRidRange *rid_range;

  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(region);
  ar_assert(context->mm_used_rids);

  // See if the region belongs to our scheduler subtree. We have to check,
  // because the search initially heads upwards (from child scheduler to
  // parent scheduler), but later on, once it passes the top level, it heads
  // downwards (from parent scheduler to child scheduler) until it reaches
  // the correct scheduler.
  kt_trie_find_approx(context->mm_used_rids, 0, region, (void *) &rid_range);
  if (rid_range) {
    // We found something, but does it contain this region ID?
    ar_assert(region >= rid_range->rid);
    if (region >= rid_range->rid + rid_range->num_rids) {
      rid_range = NULL;
    }
  }

  // We know nothing about it
  if (!rid_range) {

    // Are we the top-level scheduler?
    if (context->pr_parent_sched_id == -1) {
      // No such region exists anywhere in the system
      return ERR_NO_SUCH_REGION;
    }

    // Otherwise, our parent scheduler should know more
    else {
      return context->pr_parent_sched_id;
    }
  }

  // Region belongs to our subtree
  else {

    // Return appropriate child scheduler
    return rid_range->sched_id;
  }
}
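
Both lookups rely on the same implied kt_trie_find_approx() contract: it
returns the entry with the greatest key that is less than or equal to the
search key, so a hit must still be checked for containment. A condensed
restatement of that pattern (the helper name is illustrative; the types and
calls are the ones used above):

// Illustrative helper, not in the source: the containment check both
// sched_id lookups perform on a kt_trie_find_approx() hit, factored out.
static MmRgnAdrRange *find_containing_range(Trie *ranges, size_t adr) {
  MmRgnAdrRange *range;

  kt_trie_find_approx(ranges, 0, adr, (void *) &range);
  if (range && (adr < range->address + range->num_slabs * MM_SLAB_SIZE)) {
    return range;  // adr falls inside [address, address + extent)
  }
  return NULL;     // nearest range at or below adr does not contain it
}
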
Example #3
// ===========================================================================
// mm_distr_balloc()            Requests a new bulk-objects allocation in a 
//                              remote region from the appropriate scheduler
// ===========================================================================
// * INPUTS
//   size_t size                Size of the new object
//   rid_t region               Remote region ID of the new object
//   int num_elements           Number of objects to bulk-allocate
//
// * RETURN VALUE
//   int                        Message ID of the new request on success
//                              ERR_NO_SUCH_REGION: failure, we are sure
//                                 that this region ID does not exist in the
//                                 whole system
// ===========================================================================
int mm_distr_balloc(size_t size, rid_t region, int num_elements) {
                    
  Context               *context;
  int                   sched_id;
  PrMsgReq              *new_req;
  int                   id;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(size);
  ar_assert(region);
  ar_assert(num_elements);

  // Find which scheduler we should ask for this region
  sched_id = mm_distr_region_sched_id(region);
  if (sched_id == ERR_NO_SUCH_REGION) {

    // No such region exists anywhere in the system
    return ERR_NO_SUCH_REGION;
  }
  // It can't be ours, local functions should have picked it up
  ar_assert(sched_id != context->pr_scheduler_id);

  // Build message
  new_req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id));

  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_BALLOC;
  new_req->size    = size;
  new_req->region  = region;
  new_req->ptr     = (void *) ((size_t) num_elements);
  
  // Send message to the selected scheduler
  ar_assert(!noc_msg_send());

  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
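
Every request path in this file ends the same way: stash the current message
ID, advance the counter, and return the stashed value. The advancing step
avoids the value 0 on wrap-around (see the comment inside mm_distr_pack()
further below). A minimal sketch of what pr_advance_msg_id() could look like
under that assumption; the real implementation is not part of this listing:

// Hedged sketch only: wrap-around behavior is inferred from the
// "avoiding value 0 on wrap-arounds" comment in mm_distr_pack().
static inline int sketch_advance_msg_id(int id) {
  id++;
  if (id <= 0) {  // wrapped past the maximum: message IDs restart at 1
    id = 1;
  }
  return id;
}
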
Example #4
// ===========================================================================
// mm_distr_rfree_update_parent()  Requests the updating of a remote parent
//                                 as to the loss of one of its children
// ===========================================================================
// * INPUTS
//   rid_t parent               Remote region ID of parent
//   rid_t child                Region ID of the child that is deleted
//
// * RETURN VALUE
//   int                        Message ID of the new request on success
// ===========================================================================
int mm_distr_rfree_update_parent(rid_t parent, rid_t child) {

  Context               *context;
  int                   sched_id;
  PrMsgReq              *new_req;
  int                   id;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(parent);
  ar_assert(child);

  // Find which scheduler we should ask for the parent
  sched_id = mm_distr_region_sched_id(parent);
  if (sched_id == ERR_NO_SUCH_REGION) {
    // Corruption, we should have a valid parent ID
    ar_abort();
  }
  // It can't be ours, local functions should have picked it up
  ar_assert(sched_id != context->pr_scheduler_id);

  // Build message
  new_req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id));

  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_RFREE_UPDATE_PARENT;
  new_req->region  = parent;
  new_req->ptr     = (void *) child;
  
  // Send message to the selected scheduler
  ar_assert(!noc_msg_send());

  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
Example #5
// ===========================================================================
// mm_distr_free()              Requests the freeing of a remote object
//                              from the appropriate scheduler
// ===========================================================================
// * INPUTS
//   void *ptr                  The remote object that must be freed
//
// * RETURN VALUE
//   int                        Message ID of the new request on success
//                              ERR_OUT_OF_RANGE: failure, we are sure
//                                 that this pointer is not handled by anyone
//                                 in the whole system
// ===========================================================================
int mm_distr_free(void *ptr) {

  Context               *context;
  int                   sched_id;
  PrMsgReq              *new_req;
  int                   id;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(ptr);

  // Find out which scheduler we should ask for this object
  sched_id = mm_distr_object_sched_id(ptr);
  if (sched_id == ERR_OUT_OF_RANGE) {

    // No such object exists anywhere in the system
    return ERR_OUT_OF_RANGE;
  }
  // It can't be ours, local functions should have picked it up
  ar_assert(sched_id != context->pr_scheduler_id);

  // Build message
  new_req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id));

  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_FREE;
  new_req->ptr     = ptr;
  
  // Send message to the selected scheduler
  ar_assert(!noc_msg_send());

  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
Example #6
// ===========================================================================
// mm_distr_get_rids()          Requests more free region IDs from our parent 
//                              scheduler
// ===========================================================================
// * INPUTS
//   int num_rids               Number of region IDs we need
//
// * RETURN VALUE
//   int                        Message ID of the new request on success
//                              ERR_OUT_OF_RIDS: failure, no more region
//                                 IDs left in the whole system
// ===========================================================================
int mm_distr_get_rids(int num_rids) {

  Context               *context;
  PrMsgReq              *new_req;
  int                   id;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(num_rids);

  // Are we the top-level scheduler?
  if (context->pr_parent_sched_id == -1) {

    // No more region IDs in the whole system
    return ERR_OUT_OF_RIDS;
  }

  // Build message
  new_req = noc_msg_send_get_buf(pr_scheduler_core_id(
                                                context->pr_parent_sched_id));
  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_GET_RIDS;
  new_req->size    = num_rids;
  
  // Send message to parent
  ar_assert(!noc_msg_send());
  
  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
Example #7
File: genarr.c Project: spchamp/ilu
PUBLIC char * ar_typeDeclarator(Type t) {
    char* 		buf;
    TypeDescription	d;
    Type		eType;
    list		dimList;
    int			nDims;
    int			i;
    t = ar_assert(t);
    d = t->description;
    eType = d->structuredDes.array.type;
    dimList = d->structuredDes.array.dimensions;
    nDims = list_size(dimList);
    buf = typeDeclarator(eType);
    for (i = 0; i < nDims; i++) {
        buf = cat2(buf, "[]");
    }
    return buf;
}
Example #8
// ===========================================================================
// dbg_stats_init()             Allocates the per-core debug statistics
//                              array and zeroes all its counters
// ===========================================================================
void dbg_stats_init() {
#ifdef DBG_STATS_ENABLED

  Context       *context;
  unsigned int  i;


  // Get context
  context = mm_get_context(ar_get_core_id());

  // Allocate array
  ar_assert(!context->dbg_stats_data);
  context->dbg_stats_data = kt_malloc(DBG_STATS_NUM_STATS * sizeof(int));

  // Zero them all
  for (i = 0; i < DBG_STATS_NUM_STATS; i++) {
    context->dbg_stats_data[i] = 0;
  }

  // Init the "last timer" value to something meaningful
  context->dbg_stats_last_tmr = ar_free_timer_get_ticks();

#endif
}
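
With the array allocated and zeroed, the rest of the system presumably bumps
individual counters by index (dbg_stats_report() below reads them back the
same way). A hypothetical accessor consistent with that layout;
dbg_stats_inc() itself is invented for illustration:

// Hypothetical helper: bump one statistic, compiled away together with
// the rest of the stats code when DBG_STATS_ENABLED is off.
static inline void dbg_stats_inc(int idx) {
#ifdef DBG_STATS_ENABLED
  Context *context = mm_get_context(ar_get_core_id());

  ar_assert(context->dbg_stats_data);
  ar_assert((idx >= 0) && (idx < DBG_STATS_NUM_STATS));
  context->dbg_stats_data[idx]++;
#endif
}
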
Example #9
// ===========================================================================
// kt_mem_init()                Initializes the kernel memory allocator,
//                              bootstraps it, creates the kernel slab pool
//                              and allocates/initializes the global Context.
// ===========================================================================
void kt_mem_init() {

  Context       *context;
  int           my_cid;
  size_t        kernel_base;
  size_t        kernel_end;
  int           i;


  // Get core ID and kernel limits
  my_cid = ar_get_core_id();
  kernel_base = mm_va_kernel_base(my_cid);
  kernel_end = kernel_base + MM_KERNEL_SIZE - 1024 * 1024;
  

  // Prepare bootstrapping structures on 2 last kernel heap pages. We are
  // going to keep track of how many objects we allocate per slab slot size,
  // to a maximum of MM_BOOTSTRAP_MAX_SLOT, as well as which of these
  // objects are freed during bootstrap.
  ar_assert(MM_BOOTSTRAP_MAX_SLOT % MM_ALLOC_ALIGN == 0);
  for (i = 0; i < (MM_BOOTSTRAP_MAX_SLOT / MM_ALLOC_ALIGN); i++) {
    // A counter for alloc() calls
    ((int *) kernel_end)[i] = 0;
  }
  // Single counter for free() calls
  *((int *) (kernel_end - MM_PAGE_SIZE)) = 0;


  // Make sure we have enough space from the maximum number of objects during
  // bootstrap, so that they do not reach the two last kernel space pages
  // which are used for the counters above
  ar_assert((MM_BOOTSTRAP_MAX_SLOT / MM_ALLOC_ALIGN) *
              MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE <  // max bootstrapped adr
             kernel_end - MM_PAGE_SIZE);                // free() counter


  // Prepare context
  context = mm_get_context(my_cid);

  context->dbg_trc_data = NULL;
  context->dbg_trc_idx = 0;
  context->dbg_trc_time_start = 0;
  context->dbg_trc_offset = 0;
  
  context->dbg_stats_data = NULL;
  context->dbg_stats_last_tmr = 0;

  context->noc_mode = -1;
  context->noc_cnt_free = NULL;
  context->noc_send_buf = NULL;
  context->noc_recv_buf = NULL;
  context->noc_msg_core_id = -1;
  context->noc_credits = NULL;
  context->noc_credits_poll = NULL;
  context->noc_num_peers = 0;
  context->noc_poll_rr = 0;
  context->noc_active_dmas = NULL;
#ifdef NOC_WARN_OUT_OF_CREDITS
  context->noc_cred_warned = 0;
#endif

  context->mm_alloc_bootstrap = 1;
  context->mm_frees_bootstrap = 1;
  context->mm_kernel_pool = NULL;
  context->mm_recursion_depth = 0;
  for (i = 0; i < MM_SLAB_SIZE / MM_ALLOC_ALIGN; i++) {
    context->mm_prealloc_flags[i] = 0;
  }
  context->mm_prealloc_needed = 0;
  context->mm_busy_freeing = 0;
  context->mm_defer_frees = NULL;
  context->mm_num_defer_frees = 0;

  context->mm_region_tree = NULL;
  context->mm_used_rids = NULL;
  context->mm_free_rids = NULL;
  context->mm_used_ranges = NULL;
  context->mm_free_ranges = NULL;
  context->mm_free_num_slabs = 0;
  context->mm_free_num_rids = 0;
  context->mm_local_rids = NULL;
  context->mm_range_chunk = MM_ADR_RANGE_CHUNK_MAX;
  context->mm_last_harvest = 0;
  context->mm_load_rrobin = 0;
  context->mm_current_load = 0;
  context->mm_reported_load = 0;
  context->mm_children_load = NULL;

  context->pr_core_id = -1;
  context->pr_num_cores = -1;
  context->pr_role = -1;
  context->pr_core_bid_cid = NULL;
  context->pr_core_work_ids = NULL;
  context->pr_core_sched_ids = NULL;
  context->pr_core_child_idx = NULL;
  context->pr_work_core_ids = NULL;
  context->pr_sched_core_ids = NULL;
  context->pr_core_route = NULL;
  context->pr_num_schedulers = -1;
  context->pr_num_workers = -1;
  context->pr_worker_id = -1;
  context->pr_scheduler_id = -1;
  context->pr_parent_sched_id = -1;
  context->pr_scheduler_level = -1;
  context->pr_children = NULL;
  context->pr_num_children = -1;
  context->pr_cur_epoch = 0;
  context->pr_task_table = NULL;
  context->pr_ready_queue = NULL;
  context->pr_spawn_pending = 0;
  context->pr_tasks = NULL;
  context->pr_avail_task_id = 1;
  context->pr_sched_rr = 0;
  context->pr_load_vs_locality = PR_LOAD_VS_LOCALITY_DEFAULT;
  context->pr_cur_sched_load = 0;
  context->pr_cur_run_load = 0;
  context->pr_rep_sched_load = 0;
  context->pr_rep_run_load = 0;
  context->pr_chld_sched_load = NULL;
  context->pr_chld_run_load = NULL;
  context->pr_main_finished = 0;

  context->pr_message_id = 1;
  context->pr_pending_events = NULL;
  context->pr_incomplete_req = NULL;
  context->pr_pages_msg_id = 0;
  context->pr_rids_msg_id = 0;

  context->vid_demo_in_bid = -1;
  context->vid_demo_in_cid = -1;
  context->vid_demo_out_bid = -1;
  context->vid_demo_out_cid = -1;

#if 0
  context->sys_sched_sec = 0;
  context->sys_sched_usec = 0;
  context->sys_sched_calls = 0;
  context->sys_worker_sec = 0;
  context->sys_worker_usec = 0;
  context->sys_worker_sent = 0;
  context->sys_worker_recv = 0;
  context->sys_barrier_sec = 0;
  context->sys_barrier_usec = 0;
#endif

#ifdef ARCH_ARM
  context->fs_flash_num_sectors = 0;
  context->fs_num_blocks = 0;
  context->fs_max_inodes = 0;
  context->fs_state = NULL;
  context->fs_fdesc = NULL;
#endif

#ifdef FMPI
  context->fmpi = NULL;
#endif


  // Context is at a predefined location. Make sure the bootstrapping structs
  // know about it, so the space can be accounted for later. This is the 
  // first malloc we ever call, so no other object can take this predefined
  // location (which mm_get_context() has already returned above).
  ar_assert(kt_malloc(sizeof(Context)) == context);

  // Bootstrap memory system and create kernel memory pool
  ar_assert((MM_ALLOC_ALIGN & (MM_ALLOC_ALIGN - 1)) == 0);
  ar_assert(MM_PAGE_SIZE % MM_SLAB_SIZE == 0);
  ar_assert(MM_SLAB_SIZE % MM_ALLOC_ALIGN == 0);
  ar_assert(kernel_base % MM_SLAB_SIZE == 0);
  ar_assert(kernel_end % MM_SLAB_SIZE == 0);

  // Create kernel pool. The function will handle the rest of the
  // bootstrapping process, end the bootstrap mode and fill
  // context->mm_kernel_pool with the kernel pool.
  mm_slab_create_pool(NULL, kernel_base, MM_KERNEL_SIZE / MM_SLAB_SIZE, 1);
  ar_assert(!context->mm_alloc_bootstrap);
  ar_assert(!context->mm_frees_bootstrap);
}
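
The bootstrap counters set up at the start of kt_mem_init() live at fixed
addresses: one int per slot size in the last kernel page, plus a single
free() counter one page below it. A sketch of how the bootstrap allocation
paths would bump them under that layout; the helpers are illustrative, the
real bootstrap alloc/free code is elsewhere:

// Illustrative only: counter layout as initialized in kt_mem_init().
static inline void bootstrap_count_alloc(size_t kernel_end, int slot_idx) {
  // slot_idx in [0, MM_BOOTSTRAP_MAX_SLOT / MM_ALLOC_ALIGN), as zeroed above
  ((int *) kernel_end)[slot_idx]++;
}

static inline void bootstrap_count_free(size_t kernel_end) {
  // Single free() counter, one page below the alloc counters
  (*((int *) (kernel_end - MM_PAGE_SIZE)))++;
}
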
Example #10
// ===========================================================================
// ===========================================================================
void test_mpi_pipeline(int rank, int num_cores, int packet_size, 
                       int num_packets) {

  volatile int  *buf;
  MPI_Status    st;
  int           i;
  int           j;


  // Sanity checks
  ar_assert(num_cores > 2);

  // Allocate buffer
  buf = kt_malloc(packet_size);

  // Producer
  if (rank == 0) {

    kt_printf("%d: MPI pipeline starts\r\n", rank);

    for (i = 0; i < num_packets; i++) {
      for (j = 0; j < packet_size / 4; j++) {
        buf[j] = rank + i + j;
      }
      MPI_Send(buf, packet_size / 4, MPI_INT, rank + 1, 0, MPI_COMM_WORLD);
    }
  }

  // Middleman
  else if (rank < num_cores - 1) {

    for (i = 0; i < num_packets; i++) {

      MPI_Recv(buf, packet_size / 4, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &st);

      for (j = 0; j < packet_size / 4; j++) {
        buf[j] += rank;
      }

      MPI_Send(buf, packet_size / 4, MPI_INT, rank + 1, 0, MPI_COMM_WORLD);
    }
  }

  // Consumer
  else {

    for (i = 0; i < num_packets; i++) {

      MPI_Recv(buf, packet_size / 4, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &st);

      for (j = 0; j < packet_size / 4; j++) {

        if (buf[j] != (num_cores - 1) * (num_cores - 2) / 2 + i + j) {
          kt_printf("%d: rep %d, buf[%d] = %d [ %s ]\r\n", 
                    rank, i, j, buf[j], TEST_FAIL);
          while (1) {
            ;
          }
        }
      }
    }
    kt_printf("\n%d: Received %d * %d bytes [ %s ]\r\n", 
              rank, num_packets, packet_size, TEST_PASS);
  }

  // Free buffer
  kt_free((void *) buf);


}
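
As a concrete check of the consumer's expected value: with num_cores = 5,
ranks 1, 2 and 3 are middlemen, so each word arrives as (0 + i + j) + 1 + 2 +
3 = i + j + 6, which matches (num_cores - 1) * (num_cores - 2) / 2 =
4 * 3 / 2 = 6.
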
Example #11
File: genarr.c Project: spchamp/ilu
PRIVATE char * arrShortName(Type t) {
    t = ar_assert(t);
    return easyShortTypeNameCleaned(t);
}
Example #12
File: genarr.c Project: spchamp/ilu
PRIVATE char * arrJName(Type t) {
    t = ar_assert(t);
    return packageDotStringJ(getContextT(t), arrShortName(t));
}
Example #13
// ===========================================================================
// kt_realloc()                 Reallocates an object to a new size
// ===========================================================================
// * INPUTS
//   void *old_ptr              The old object to be reallocated
//   size_t new_size            Number of bytes for the new object
//
// * RETURN VALUE
//   void *                     Pointer to new allocated object. Note that
//                              NULL must not be returned -- out of memory
//                              in kernel space should trigger an abort.
// ===========================================================================
void *kt_realloc(void *old_ptr, size_t new_size) {
  
  int           old_size;
  void          *new_ptr;
  Context       *context;


  // Query old pointer. We accept NULL pointers in realloc.
  context = mm_get_context(ar_get_core_id());
  if (old_ptr) {
    old_size = mm_slab_query_pointer(context->mm_kernel_pool, (size_t) old_ptr);
    ar_assert(old_size > 0);
  }
  else {
    old_size = 0;
  }

  // Refuse requests of 2 GB or more (see kt_malloc above)
  ar_assert(new_size < ((size_t) 1 << 31));

  // Align size request to nearest allowed size
  if (new_size & (MM_ALLOC_ALIGN - 1)) {
    new_size = (new_size & ~(MM_ALLOC_ALIGN - 1)) + MM_ALLOC_ALIGN;
  }

  // It may happen that the same size is requested, e.g. by asking for 
  // 4, then 8, then 12, ..., but alignment forces all of them to be 64 bytes.
  // If so, do nothing.
  if (old_size == new_size) {
    return old_ptr;
  }

  // The underlying slab allocator will never* return the same pointer for a
  // different slot size. Allocate the new pointer before freeing the old
  // one, so that Tries or other allocations cannot take its position before
  // we manage to copy its contents.
  //
  // *: strictly, "almost never". It definitely holds for requests smaller
  //    than the slab size when not enough empty slabs are preallocated. In
  //    the other cases, freeing the old pointer before the new malloc could
  //    hand back the same address. For kernel objects, which are small and
  //    dominated by mallocs, not frees, that is nearly impossible, but we
  //    still avoid that ordering: an intermediate malloc (e.g. a Trie
  //    kt_malloc) could take up the space and overwrite the data before we
  //    finish here -- and there's no easy way of knowing that.
  if (new_size) {
    ar_assert(new_ptr = kt_malloc(new_size));
  }
  else {
    new_ptr = NULL;
  }

  // Copy contents
  if (new_ptr && old_size) {
    kt_memcpy(new_ptr, old_ptr, (old_size < new_size) ? old_size : new_size);
  }

  // Free old pointer
  if (old_ptr) {
    kt_free(old_ptr);
  }

  // Success
  return new_ptr;
}
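
A usage sketch under the semantics documented above: a NULL old pointer
degenerates to a plain allocation, and sizes are rounded up to MM_ALLOC_ALIGN
internally. Everything here except kt_realloc() and kt_free() is illustrative:

// Illustrative only: grow an array one element at a time, the same
// kt_realloc(NULL, ...) pattern mm_distr_pack() uses for *ret_msg_ids.
void kt_realloc_usage_sketch(void) {
  int *values = NULL;
  int count;
  int v;

  for (v = 0, count = 0; v < 8; v++) {
    values = kt_realloc(values, (count + 1) * sizeof(int));
    values[count++] = v;
  }
  kt_free(values);
}
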
Example #14
// ===========================================================================
// ===========================================================================
int stream_bare(int n) {
    float               *a, *b, *c;

    int                 avgtime[4] = {0}, maxtime[4] = {0},
                        mintime[4] = {0x3FFFFFFF, 0x3FFFFFFF,
                                      0x3FFFFFFF, 0x3FFFFFFF};

    char                *label[4] = {"Copy:      ", "Scale:     ",
                                     "Add:       ", "Triad:     "};

    float               bytes[4] = {
                          2 * sizeof(float) * n,
                          2 * sizeof(float) * n,
                          3 * sizeof(float) * n,
                          3 * sizeof(float) * n
                        };

    int                 BytesPerWord;
    register int        j, k;
    float               scalar;
    int                 t, times[4][NTIMES];

    
    /* --- SETUP --- determine precision and check timing --- */
    kt_printf(HLINE);
    kt_printf("STREAM version $Revision: 1.1 $\n");
    kt_printf(HLINE);
    
    // Assign arrays into kernel heap space (too big to fit in stack)
    a = (float *) mm_va_kernel_base(ar_get_core_id());
    b = a + n;
    c = b + n;
    ar_assert((unsigned int) ((c + n) - a) < MM_KERNEL_SIZE);

    BytesPerWord = sizeof(float);
    //kt_printf("This system uses %d bytes per SINGLE PRECISION word.\n",
    //    BytesPerWord);

    //kt_printf(HLINE);
    kt_printf("Array size = %d, Offset = 0, total memory req. = %.2f KB.\n",
        n, (3.0 * BytesPerWord * (float) n) / 1024.0);
    //kt_printf("Each test is run %d times, but only\n", NTIMES);
    //kt_printf("the *best* time for each is used.\n");


    //kt_printf(HLINE);

    /* Get initial value for system clock. */
    for (j=0; j<n; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
        }

    //if  ( (quantum = checktick()) >= 1) 
    //    kt_printf("Your clock granularity/precision appears to be "
    //        "%d microseconds.\n", quantum);
    //else {
    //    kt_printf("Your clock granularity appears to be "
    //        "less than one microsecond.\n");
    //    quantum = 1;
    //}

    ar_timer_reset();
    for (j = 0; j < n; j++)
        a[j] = 2.0E0 * a[j];
    t = ar_timer_get_cycles();

    //kt_printf("Each test below will take on the order"
    //    " of %d clock cycles.\n", t  );
    //kt_printf("Increase the size of the arrays if this shows that\n");
    //kt_printf("you are not getting at least 20 clock cycles per test.\n");

    kt_printf(HLINE);

    /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
        {
        ar_timer_reset();

        for (j=0; j<n; j++)
            c[j] = a[j];

        times[0][k] = ar_timer_get_cycles();
        
        ar_timer_reset();

        for (j=0; j<n; j++)
            b[j] = scalar*c[j];

        times[1][k] = ar_timer_get_cycles();
        
        ar_timer_reset();

        for (j=0; j<n; j++)
            c[j] = a[j]+b[j];

        times[2][k] = ar_timer_get_cycles();
        
        ar_timer_reset();

        for (j=0; j<n; j++)
            a[j] = b[j]+scalar*c[j];

        times[3][k] = ar_timer_get_cycles();
        }

    /*  --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
        {
        for (j=0; j<4; j++)
            {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
            }
        }
    
    kt_printf("Function   Rate (B/cc)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
        avgtime[j] = avgtime[j]/(float)(NTIMES-1);

        kt_printf("%s%11.4f  %13d  %11d  %11d\n", label[j],
               (float) bytes[j] / (float) mintime[j],
               avgtime[j],
               mintime[j],
               maxtime[j]);
    }
    kt_printf(HLINE);

    /* --- Check Results --- */
    stream_check_results(a, b, c, n);
    //kt_printf(HLINE);

    return 0;
}
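
To read the summary: the printed rate is bytes[j] divided by the best
(minimum) cycle count, i.e. bytes per clock cycle. For example, the Copy
kernel moves 2 * sizeof(float) * n = 8n bytes per pass; with n = 1,000,000
and a best pass of 4,000,000 cycles, the reported rate would be
8,000,000 / 4,000,000 = 2.0 B/cc.
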
Example #15
// ===========================================================================
// mm_distr_pack()              Create multiple packing requests to
//                              appropriate schedulers for packing multiple
//                              objects and regions. 
//
//                              The regions and objects given to this function
//                              are checked one by one as to which scheduler
//                              should be responsible for them. Before any
//                              messages are sent, per-scheduler arrays are
//                              created to gather all these decisions. When
//                              this sorting process is finished, request(s)
//                              to each scheduler are made, as many as needed
//                              for the number of regions/objects that
//                              belong to each scheduler.
//
//                              Because many messages to many schedulers may
//                              be sent, an array of the new message IDs is
//                              returned.
// ===========================================================================
// * INPUTS
//   rid_t *regions             Array of remote region IDs to be packed
//   int *region_options        Array of remote region packing options
//   int num_regions            Number of regions in array
//   void **objects             Array of remote objects to be packed
//   int *object_options        Array of remote object packing options
//   int num_objects            Number of objects in array
//   
// * OUTPUTS
//   int **ret_msg_ids          Array of returned message IDs, one for each
//                              message sent
//   int *ret_num_messages      Number of messages generated
//   void **ret_error_ptr       In case of error, region ID or object that
//                              triggered it
//
// * RETURN VALUE
//   int                        0: success, *ret_num_messages generated
//                              ERR_NO_SUCH_REGION: failure, *ret_error_ptr
//                                 region ID is invalid in the whole system.
//                                 No messages generated.
//                              ERR_OUT_OF_RANGE: failure, *ret_error_ptr
//                                 object is invalid in the whole system.
//                                 No messages generated.
// ===========================================================================
int mm_distr_pack(rid_t *regions, int *region_options, int num_regions, 
                  void **objects, int *object_options, int num_objects, 
                  int **ret_msg_ids, int *ret_num_messages,
                  void **ret_error_ptr) {

  typedef struct {
    rid_t *regions;
    int   *region_options;
    int   num_regions;
    void  **objects;
    int   *object_options;
    int   num_objects;
  } pack_per_sched_type;

  Context               *context;
  PrMsgReq              *req;
  Trie                  *trie;
  pack_per_sched_type   *per_sched;
  int                   sched_id;
  int                   error_status;
  int                   cur_reg;
  int                   cur_obj;
  int                   i;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(ret_msg_ids);
  ar_assert(ret_num_messages);
  ar_assert(ret_error_ptr);

  // Initialize return variables
  *ret_msg_ids = NULL;
  *ret_num_messages = 0;
  *ret_error_ptr = NULL;

  // Create a trie to store per-scheduler packing needs. We'll be using IDs
  // from 1 to context->pr_num_schedulers, so the MSB is the log2 of
  // context->pr_num_schedulers. The kt_int_log2() function will return the
  // MSB position correctly, even for non-power-of-2 values.
  ar_assert(trie = kt_alloc_trie(kt_int_log2(context->pr_num_schedulers), 0));


  // For all regions
  for (i = 0; i < num_regions; i++) {

    // Find which scheduler we should ask for this region
    sched_id = mm_distr_region_sched_id(regions[i]);
    if (sched_id == ERR_NO_SUCH_REGION) {

      // No such region exists anywhere in the system. Abort.
      *ret_error_ptr = (void *) regions[i];
      error_status = ERR_NO_SUCH_REGION;
      goto error;
    }
    // It can't be ours, local functions should have picked it up
    ar_assert(sched_id != context->pr_scheduler_id);

    // We use scheduler ID + 1, because tries cannot handle zero keys
    sched_id++;

    // Do we have anything else for this scheduler ID?
    kt_trie_find(trie, sched_id, (void *) &per_sched);

    // Append this region to existing entry
    if (per_sched) {
      per_sched->regions = kt_realloc(per_sched->regions,
                                      (per_sched->num_regions + 1) *
                                                sizeof(rid_t));
      per_sched->region_options = kt_realloc(per_sched->region_options,
                                             (per_sched->num_regions + 1) *
                                                       sizeof(int));
      per_sched->regions[per_sched->num_regions] = regions[i];
      per_sched->region_options[per_sched->num_regions] = region_options[i];
      per_sched->num_regions++;
    }

    // Create new entry for this scheduler
    else {
      per_sched = kt_malloc(sizeof(pack_per_sched_type));
      per_sched->regions = kt_malloc(sizeof(rid_t));
      per_sched->region_options = kt_malloc(sizeof(int));
      per_sched->regions[0] = regions[i];
      per_sched->region_options[0] = region_options[i];
      per_sched->num_regions = 1;
      per_sched->objects = NULL;
      per_sched->object_options = NULL;
      per_sched->num_objects = 0;

      // Insert it into the trie
      ar_assert(!kt_trie_insert(trie, sched_id, per_sched));
    }
  }


  // For all objects
  for (i = 0; i < num_objects; i++) {

    // Find which scheduler we should ask for this object
    sched_id = mm_distr_object_sched_id(objects[i]);
    if (sched_id == ERR_OUT_OF_RANGE) {

      // No such object exists anywhere in the system. Abort.
      *ret_error_ptr = objects[i];
      error_status = ERR_OUT_OF_RANGE;
      goto error;
    }
    // It can't be ours, local functions should have picked it up
    ar_assert(sched_id != context->pr_scheduler_id);

    // We use scheduler ID + 1, because tries cannot handle zero keys
    sched_id++;

    // Do we have anything else for this scheduler ID?
    kt_trie_find(trie, sched_id, (void *) &per_sched);

    // Append this object to existing entry
    if (per_sched) {
      per_sched->objects = kt_realloc(per_sched->objects,
                                      (per_sched->num_objects + 1) *
                                                sizeof(void *));
      per_sched->object_options = kt_realloc(per_sched->object_options,
                                      (per_sched->num_objects + 1) *
                                                sizeof(int));
      per_sched->objects[per_sched->num_objects] = objects[i];
      per_sched->object_options[per_sched->num_objects] = object_options[i];
      per_sched->num_objects++;
    }

    // Create new entry for this scheduler
    else {
      per_sched = kt_malloc(sizeof(pack_per_sched_type));
      per_sched->objects = kt_malloc(sizeof(void *));
      per_sched->object_options = kt_malloc(sizeof(int));
      per_sched->objects[0] = objects[i];
      per_sched->object_options[0] = object_options[i];
      per_sched->num_objects = 1;
      per_sched->regions = NULL;
      per_sched->region_options = NULL;
      per_sched->num_regions = 0;

      // Insert it into the trie
      ar_assert(!kt_trie_insert(trie, sched_id, per_sched));
    }
  }


  // For all schedulers that we need to communicate with
  for (sched_id = kt_trie_find_minmax(trie, 0, (void *) &per_sched);
       sched_id;
       sched_id = kt_trie_find_next(trie, 1, (void *) &per_sched)) {

    // Restore correct scheduler ID value
    sched_id--;

    // Initialize
    cur_reg = 0;
    cur_obj = 0;
    req = NULL;

    // Loop until all regions and objects have been sent
    while ((cur_reg < per_sched->num_regions) || 
           (cur_obj < per_sched->num_objects)) {

      // Start building new message
      req = noc_msg_send_get_buf(pr_scheduler_core_id(sched_id));

      req->core_id = context->pr_core_id;
      req->req_id  = context->pr_message_id;
      req->size    = 0;

      // Append this message ID to the array of IDs we'll return
      ar_assert(*ret_msg_ids = kt_realloc(*ret_msg_ids, 
                                            (*ret_num_messages + 1) * 
                                            sizeof(int)));
      (*ret_msg_ids)[*ret_num_messages] = req->req_id;
      (*ret_num_messages)++;

      // Increase message ID, avoiding value 0 on wrap-arounds
      context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

      // Embed first remaining region and first remaining object to basic
      // request
      if (cur_reg < per_sched->num_regions) {
        ar_assert(per_sched->regions[cur_reg]);
        req->region = per_sched->regions[cur_reg];
        req->size |= per_sched->region_options[cur_reg] << 0;
        cur_reg++;
      }
      else {
        req->region = 0;
      }
      if (cur_obj < per_sched->num_objects) {
        ar_assert(per_sched->objects[cur_obj]);
        req->ptr = per_sched->objects[cur_obj];
        req->size |= per_sched->object_options[cur_obj] << MM_PACK_OPTION_BITS;
        cur_obj++;
      }
      else {
        req->ptr = NULL;
      }

      // Did we fit or do we need an extended request?
      if ((cur_reg >= per_sched->num_regions) && 
          (cur_obj >= per_sched->num_objects)) {
        req->type = REQ_PACK;

        // Send basic request only to scheduler
        ar_assert(!noc_msg_send());
        req = NULL;
      }
      else {
        req->type = EXT_REQ_PACK;

        // Put regions in first slots of extended array
        for (req->num_regions = 0;
             (req->num_regions < PR_REQ_MAX_SIZE) && 
             (cur_reg < per_sched->num_regions);
             req->num_regions++, cur_reg++) {
          ar_assert(per_sched->regions[cur_reg]);
          req->data[req->num_regions] = (void *) per_sched->regions[cur_reg];
          req->size |= per_sched->region_options[cur_reg] <<
                              ((2 + req->num_regions) * MM_PACK_OPTION_BITS);
        }

        // Put objects on following slots, if regions are finished
        for (req->num_ptrs = 0;
             (req->num_regions + req->num_ptrs < PR_REQ_MAX_SIZE) &&
             (cur_obj < per_sched->num_objects);
             req->num_ptrs++, cur_obj++) {
          ar_assert(per_sched->objects[cur_obj]);
          req->data[req->num_regions + req->num_ptrs] = 
                                                  per_sched->objects[cur_obj];
          req->size |= per_sched->object_options[cur_obj] <<
                ((2 + req->num_regions + req->num_ptrs) * MM_PACK_OPTION_BITS);
        }

        // Send extended request to scheduler
        ar_assert(!noc_msg_send());
        req = NULL;
      }
    }

    // Free this entry
    kt_free(per_sched->regions);
    kt_free(per_sched->region_options);
    kt_free(per_sched->objects);
    kt_free(per_sched->object_options);
    kt_free(per_sched);
  }

  // Free the trie
  kt_free_trie(trie, NULL);

  // Success
  return 0;


error:

  // Free all allocated per-scheduler entries
  for (sched_id = kt_trie_find_minmax(trie, 0, (void *) &per_sched);
       sched_id;
       sched_id = kt_trie_find_next(trie, 1, (void *) &per_sched)) {
    kt_free(per_sched->regions);
    kt_free(per_sched->region_options);
    kt_free(per_sched->objects);
    kt_free(per_sched->object_options);
    kt_free(per_sched);
  }

  // Free the trie
  kt_free_trie(trie, NULL);

  // Return error code; error_ptr is set above.
  return error_status;

}
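
The packing options above overload req->size as a bit field: field 0 holds
the embedded region's options, field 1 the embedded object's, and extended
entries take fields 2, 3, ... in req->data order, each MM_PACK_OPTION_BITS
wide. A sketch of the matching receive-side extraction; the decoder is
invented here, and the mask width is implied by the sender's shifts rather
than stated in the source:

// Illustrative decoder for the req->size option layout built above.
static inline int unpack_pack_option(unsigned int size_field, int field) {
  return (size_field >> (field * MM_PACK_OPTION_BITS)) &
         ((1 << MM_PACK_OPTION_BITS) - 1);
}
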
Example #16
// ===========================================================================
// mm_distr_ralloc_orphan()     Requests a new region creation under a
//                              remote parent region from the appropriate
//                              scheduler, telling it that the new region
//                              is an orphan, i.e. has a remote parent ID.
// ===========================================================================
// * INPUTS
//   rid_t parent               Region ID of the remote parent region
//   int src_core_id            Core ID of the scheduler that did the request,
//                              so we can take it into account when deciding
//                              which scheduler should own the new orphan
//
// * RETURN VALUE
//   int                        Message ID of the new request
// ===========================================================================
int mm_distr_ralloc_orphan(rid_t parent, int src_core_id) {

  Context               *context;
  int                   src_sched_id;
  MmRgnTreeNode         *parent_node;
  int                   child_id;
  PrMsgReq              *new_req;
  int                   id;
  int                   i;


  // Sanity checks
  context = mm_get_context(ar_get_core_id());
  ar_assert(parent);
  ar_assert(context->pr_scheduler_level);
  ar_assert(context->pr_num_children);
  src_sched_id = pr_core_scheduler_id(src_core_id);
  ar_assert((src_sched_id >= 0) && 
            (src_sched_id < context->pr_num_schedulers));

  // The parent region should belong to us. Find the region node.
  ar_assert(kt_trie_find(context->mm_local_rids, parent, 
                         (void *) &parent_node));


  // In the stand-alone allocator version, we only load-balance if requests
  // come from "above" (i.e. the request from a worker reaches the top-level
  // scheduler and starts to descend). In the Myrmics version, there is 
  // no such case: a task is allowed to see only subtrees of objects/regions
  // it owns, so requests always come from "below". Thus, we always
  // load-balance.

#if 0
  // Request comes from one of our children
  if (src_sched_id != context->pr_parent_sched_id) {
    // Verify he's one of our children
    child_id = -1;
    for (i = 0; i < context->pr_num_children; i++) {
      if (context->pr_children[i] == src_core_id) {
        child_id = i;
        break;
      }
    }
    ar_assert(child_id > -1);
  }
  
  // Request comes from our parent
  else {
#endif  
    // Search for the least load, favoring the last round-robin position
    // decided, in case of a draw
    child_id = context->mm_load_rrobin;

    for (i = 0; i < context->pr_num_children; i++) {
      if (context->mm_children_load[i] < context->mm_children_load[child_id]) {
        child_id = i;
      }
    }

    // New round robin position is the next child from the one we decided
    context->mm_load_rrobin = child_id + 1;
    if (context->mm_load_rrobin >= context->pr_num_children) {
      context->mm_load_rrobin = 0;
    }
#if 0
  }
#endif

  // Build message
  new_req = noc_msg_send_get_buf(context->pr_children[child_id]);

  new_req->core_id = context->pr_core_id;
  new_req->req_id  = context->pr_message_id;
  new_req->type    = REQ_RALLOC_ORPHAN;
  new_req->region  = parent;
  new_req->size    = parent_node->location;
  
  // Send message to the selected scheduler
  ar_assert(!noc_msg_send());

  // Increase message ID
  id = context->pr_message_id;
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);

  // Success
  return id;
}
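
The child selection above favors the round-robin position only on a draw,
because the comparison is strict. A standalone restatement of that policy
(illustrative names; load and num_children stand in for the context fields):

// Illustrative restatement of the selection in mm_distr_ralloc_orphan():
// start at the round-robin cursor and only move away from it on a strictly
// smaller load, so the cursor wins draws; then advance the cursor.
static int pick_least_loaded(int *load, int num_children, int *rrobin) {
  int child = *rrobin;
  int i;

  for (i = 0; i < num_children; i++) {
    if (load[i] < load[child]) {
      child = i;
    }
  }
  *rrobin = (child + 1 < num_children) ? (child + 1) : 0;
  return child;
}
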
Example #17
// ===========================================================================
// dbg_stats_report()           Gathers the statistics arrays from all cores
//                              at the top-level scheduler and prints a
//                              summary report
// ===========================================================================
// * INPUTS
//   char *filename             Name of the file the report is written to
// ===========================================================================
void dbg_stats_report(char *filename) {
#ifdef DBG_STATS_ENABLED

#define SUMMARY_BUF_SIZE 8196

  Context       *context;
  int           my_bid;
  int           my_cid;
  int           bid;
  int           cid;
  unsigned int  **stats = NULL;
  char          *buf = NULL;
  char          fmt_buf[16];
  char          *s;
  unsigned int  i;
  unsigned int  j;

  unsigned int  sch_idle_tot = 0;
  unsigned int  sch_idle_avg = 0;
  int           sch_idle_avg_dec = 0;
  int           sch_idle_avg_frac = 0;
  char          sch_idle_avg_units = 0;
  unsigned int  sch_idle_min = UINT_MAX;
  int           sch_idle_min_idx = -1;
  int           sch_idle_min_dec = 0;
  int           sch_idle_min_frac = 0;
  char          sch_idle_min_units = ' ';
  unsigned int  sch_idle_max = 0;
  int           sch_idle_max_idx = -1;
  int           sch_idle_max_dec = 0;
  int           sch_idle_max_frac = 0;
  char          sch_idle_max_units = 0;

  unsigned int  wrk_idle_tot = 0;
  unsigned int  wrk_idle_avg = 0;
  int           wrk_idle_avg_dec = 0;
  int           wrk_idle_avg_frac = 0;
  char          wrk_idle_avg_units = 0;
  unsigned int  wrk_idle_min = UINT_MAX;
  int           wrk_idle_min_idx = -1;
  int           wrk_idle_min_dec = 0;
  int           wrk_idle_min_frac = 0;
  char          wrk_idle_min_units = ' ';
  unsigned int  wrk_idle_max = 0;
  int           wrk_idle_max_idx = -1;
  int           wrk_idle_max_dec = 0;
  int           wrk_idle_max_frac = 0;
  char          wrk_idle_max_units = 0;

  unsigned int  sch_mem_tot = 0;
  unsigned int  sch_mem_avg = 0;
  int           sch_mem_avg_dec = 0;
  int           sch_mem_avg_frac = 0;
  char          sch_mem_avg_units = 0;
  unsigned int  sch_mem_min = UINT_MAX;
  int           sch_mem_min_idx = -1;
  int           sch_mem_min_dec = 0;
  int           sch_mem_min_frac = 0;
  char          sch_mem_min_units = ' ';
  unsigned int  sch_mem_max = 0;
  int           sch_mem_max_idx = -1;
  int           sch_mem_max_dec = 0;
  int           sch_mem_max_frac = 0;
  char          sch_mem_max_units = 0;

  unsigned int  wrk_work_tot = 0;
  unsigned int  wrk_work_avg = 0;
  int           wrk_work_avg_dec = 0;
  int           wrk_work_avg_frac = 0;
  char          wrk_work_avg_units = 0;
  unsigned int  wrk_work_min = UINT_MAX;
  int           wrk_work_min_idx = -1;
  int           wrk_work_min_dec = 0;
  int           wrk_work_min_frac = 0;
  char          wrk_work_min_units = ' ';
  unsigned int  wrk_work_max = 0;
  int           wrk_work_max_idx = -1;
  int           wrk_work_max_dec = 0;
  int           wrk_work_max_frac = 0;
  char          wrk_work_max_units = 0;

  unsigned int  sch_sched_tot = 0;
  unsigned int  sch_sched_avg = 0;
  int           sch_sched_avg_dec = 0;
  int           sch_sched_avg_frac = 0;
  char          sch_sched_avg_units = 0;
  unsigned int  sch_sched_min = UINT_MAX;
  int           sch_sched_min_idx = -1;
  int           sch_sched_min_dec = 0;
  int           sch_sched_min_frac = 0;
  char          sch_sched_min_units = ' ';
  unsigned int  sch_sched_max = 0;
  int           sch_sched_max_idx = -1;
  int           sch_sched_max_dec = 0;
  int           sch_sched_max_frac = 0;
  char          sch_sched_max_units = 0;

  unsigned int  wrk_wait_tot = 0;
  unsigned int  wrk_wait_avg = 0;
  int           wrk_wait_avg_dec = 0;
  int           wrk_wait_avg_frac = 0;
  char          wrk_wait_avg_units = 0;
  unsigned int  wrk_wait_min = UINT_MAX;
  int           wrk_wait_min_idx = -1;
  int           wrk_wait_min_dec = 0;
  int           wrk_wait_min_frac = 0;
  char          wrk_wait_min_units = ' ';
  unsigned int  wrk_wait_max = 0;
  int           wrk_wait_max_idx = -1;
  int           wrk_wait_max_dec = 0;
  int           wrk_wait_max_frac = 0;
  char          wrk_wait_max_units = 0;

  unsigned int  sch_tasks_tot = 0;
  unsigned int  sch_tasks_avg = 0;
  int           sch_tasks_avg_dec = 0;
  int           sch_tasks_avg_frac = 0;
  char          sch_tasks_avg_units = 0;
  unsigned int  sch_tasks_min = UINT_MAX;
  int           sch_tasks_min_idx = -1;
  int           sch_tasks_min_dec = 0;
  int           sch_tasks_min_frac = 0;
  char          sch_tasks_min_units = ' ';
  unsigned int  sch_tasks_max = 0;
  int           sch_tasks_max_idx = -1;
  int           sch_tasks_max_dec = 0;
  int           sch_tasks_max_frac = 0;
  char          sch_tasks_max_units = 0;

  unsigned int  wrk_tasks_tot = 0;
  unsigned int  wrk_tasks_avg = 0;
  int           wrk_tasks_avg_dec = 0;
  int           wrk_tasks_avg_frac = 0;
  char          wrk_tasks_avg_units = 0;
  unsigned int  wrk_tasks_min = UINT_MAX;
  int           wrk_tasks_min_idx = -1;
  int           wrk_tasks_min_dec = 0;
  int           wrk_tasks_min_frac = 0;
  char          wrk_tasks_min_units = ' ';
  unsigned int  wrk_tasks_max = 0;
  int           wrk_tasks_max_idx = -1;
  int           wrk_tasks_max_dec = 0;
  int           wrk_tasks_max_frac = 0;
  char          wrk_tasks_max_units = 0;

  unsigned int  sch_msg_tot = 0;
  unsigned int  sch_msg_avg = 0;
  int           sch_msg_avg_dec = 0;
  int           sch_msg_avg_frac = 0;
  char          sch_msg_avg_units = 0;
  unsigned int  sch_msg_min = UINT_MAX;
  int           sch_msg_min_idx = -1;
  int           sch_msg_min_dec = 0;
  int           sch_msg_min_frac = 0;
  char          sch_msg_min_units = ' ';
  unsigned int  sch_msg_max = 0;
  int           sch_msg_max_idx = -1;
  int           sch_msg_max_dec = 0;
  int           sch_msg_max_frac = 0;
  char          sch_msg_max_units = 0;

  unsigned int  wrk_msg_tot = 0;
  unsigned int  wrk_msg_avg = 0;
  int           wrk_msg_avg_dec = 0;
  int           wrk_msg_avg_frac = 0;
  char          wrk_msg_avg_units = 0;
  unsigned int  wrk_msg_min = UINT_MAX;
  int           wrk_msg_min_idx = -1;
  int           wrk_msg_min_dec = 0;
  int           wrk_msg_min_frac = 0;
  char          wrk_msg_min_units = ' ';
  unsigned int  wrk_msg_max = 0;
  int           wrk_msg_max_idx = -1;
  int           wrk_msg_max_dec = 0;
  int           wrk_msg_max_frac = 0;
  char          wrk_msg_max_units = 0;

  unsigned int  wrk_ndma_tot = 0;
  unsigned int  wrk_ndma_avg = 0;
  int           wrk_ndma_avg_dec = 0;
  int           wrk_ndma_avg_frac = 0;
  char          wrk_ndma_avg_units = 0;
  unsigned int  wrk_ndma_min = UINT_MAX;
  int           wrk_ndma_min_idx = -1;
  int           wrk_ndma_min_dec = 0;
  int           wrk_ndma_min_frac = 0;
  char          wrk_ndma_min_units = ' ';
  unsigned int  wrk_ndma_max = 0;
  int           wrk_ndma_max_idx = -1;
  int           wrk_ndma_max_dec = 0;
  int           wrk_ndma_max_frac = 0;
  char          wrk_ndma_max_units = 0;

  unsigned int  wrk_sdma_tot = 0;
  unsigned int  wrk_sdma_avg = 0;
  int           wrk_sdma_avg_dec = 0;
  int           wrk_sdma_avg_frac = 0;
  char          wrk_sdma_avg_units = 0;
  unsigned int  wrk_sdma_min = UINT_MAX;
  int           wrk_sdma_min_idx = -1;
  int           wrk_sdma_min_dec = 0;
  int           wrk_sdma_min_frac = 0;
  char          wrk_sdma_min_units = ' ';
  unsigned int  wrk_sdma_max = 0;
  int           wrk_sdma_max_idx = -1;
  int           wrk_sdma_max_dec = 0;
  int           wrk_sdma_max_frac = 0;
  char          wrk_sdma_max_units = 0;


  // Get context
  context = mm_get_context(ar_get_core_id());
  my_cid = ar_get_core_id();
  my_bid = ar_get_board_id();


  // =========================================================================
  // Gather stats from everybody
  // =========================================================================

  // Top-level scheduler is the master core for reporting
  if (context->pr_parent_sched_id == -1) {

    // Allocate space for all
    stats = kt_malloc(context->pr_num_cores * sizeof(unsigned int *));
    for (i = 0; i < context->pr_num_cores; i++) {
      stats[i] = kt_malloc(DBG_STATS_NUM_STATS * sizeof(unsigned int));
    }
    
    // For all cores in the setup
    for (i = 0; i < context->pr_num_cores; i++) {

      // Get arch-level board/core ID
      pr_core_arch_bid_cid(i, &bid, &cid);

      // Is it us?
      if ((bid == my_bid) && (cid == my_cid)) {

        // Copy our own stats
        for (j = 0; j < DBG_STATS_NUM_STATS; j++) {
          stats[i][j] = context->dbg_stats_data[j];
        }

        continue;
      }

      // Handshake with peer and tell him to send us his stats; send our
      // bid/cid so he can communicate back
      ar_mbox_send(my_cid, bid, cid, DBG_STATS_MAGIC_HSHAKE1);
      ar_mbox_send(my_cid, bid, cid, (my_bid << 8) | my_cid);
      
      // Sanity check: get start-of-transmission
      ar_assert(ar_mbox_get(my_cid) == DBG_STATS_MAGIC_HSHAKE2);

      // Copy his stats
      for (j = 0; j < DBG_STATS_NUM_STATS; j++) {
        stats[i][j] = ar_mbox_get(my_cid);
      }
      
      // Sanity check: get end-of-transmission
      ar_assert(ar_mbox_get(my_cid) == DBG_STATS_MAGIC_HSHAKE3);
    }
  }

  // Other cores are slaves
  else {

    // Receive master command and his bid/cid
    ar_assert(ar_mbox_get(my_cid) == DBG_STATS_MAGIC_HSHAKE1);
    i = ar_mbox_get(my_cid);
    bid = i >> 8;
    cid = i & 0xFF;

    // Send start-of-transmission
    ar_mbox_send(my_cid, bid, cid, DBG_STATS_MAGIC_HSHAKE2);

    // Send our stats
    for (i = 0; i < DBG_STATS_NUM_STATS; i++) {
      ar_mbox_send(my_cid, bid, cid, context->dbg_stats_data[i]);
    }

    // Send end-of-transmission
    ar_mbox_send(my_cid, bid, cid, DBG_STATS_MAGIC_HSHAKE3);
  }


  // =========================================================================
  // Create a summary and print it
  // =========================================================================
  
  // Top-level scheduler only, everybody else get out
  if (context->pr_parent_sched_id != -1) {
    return;
  }

  // Scheduler & worker idle time
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_sched_ids[i] > -1) {
      sch_idle_tot += stats[i][DBG_STATS_IDX_TIME_IDLE];
      if (stats[i][DBG_STATS_IDX_TIME_IDLE] < sch_idle_min) {
        sch_idle_min = stats[i][DBG_STATS_IDX_TIME_IDLE];
        sch_idle_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_TIME_IDLE] > sch_idle_max) {
        sch_idle_max = stats[i][DBG_STATS_IDX_TIME_IDLE];
        sch_idle_max_idx = i;
      }
    }
    else {
      wrk_idle_tot += stats[i][DBG_STATS_IDX_TIME_IDLE];
      if (stats[i][DBG_STATS_IDX_TIME_IDLE] < wrk_idle_min) {
        wrk_idle_min = stats[i][DBG_STATS_IDX_TIME_IDLE];
        wrk_idle_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_TIME_IDLE] > wrk_idle_max) {
        wrk_idle_max = stats[i][DBG_STATS_IDX_TIME_IDLE];
        wrk_idle_max_idx = i;
      }
    }
  }
  sch_idle_avg = sch_idle_tot / context->pr_num_schedulers;
  wrk_idle_avg = wrk_idle_tot / context->pr_num_workers;
  
  dbg_stats_format(sch_idle_avg, &sch_idle_avg_dec, &sch_idle_avg_frac, 
                   &sch_idle_avg_units);
  dbg_stats_format(sch_idle_min, &sch_idle_min_dec, &sch_idle_min_frac, 
                   &sch_idle_min_units);
  dbg_stats_format(sch_idle_max, &sch_idle_max_dec, &sch_idle_max_frac, 
                   &sch_idle_max_units);

  dbg_stats_format(wrk_idle_avg, &wrk_idle_avg_dec, &wrk_idle_avg_frac, 
                   &wrk_idle_avg_units);
  dbg_stats_format(wrk_idle_min, &wrk_idle_min_dec, &wrk_idle_min_frac, 
                   &wrk_idle_min_units);
  dbg_stats_format(wrk_idle_max, &wrk_idle_max_dec, &wrk_idle_max_frac, 
                   &wrk_idle_max_units);

  // Scheduler mem time
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_sched_ids[i] > -1) {
      sch_mem_tot += stats[i][DBG_STATS_IDX_TIME_MEM_SERVE];
      if (stats[i][DBG_STATS_IDX_TIME_MEM_SERVE] < sch_mem_min) {
        sch_mem_min = stats[i][DBG_STATS_IDX_TIME_MEM_SERVE];
        sch_mem_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_TIME_MEM_SERVE] > sch_mem_max) {
        sch_mem_max = stats[i][DBG_STATS_IDX_TIME_MEM_SERVE];
        sch_mem_max_idx = i;
      }
    }
  }
  sch_mem_avg = sch_mem_tot / context->pr_num_schedulers;
  
  dbg_stats_format(sch_mem_avg, &sch_mem_avg_dec, &sch_mem_avg_frac, 
                   &sch_mem_avg_units);
  dbg_stats_format(sch_mem_min, &sch_mem_min_dec, &sch_mem_min_frac, 
                   &sch_mem_min_units);
  dbg_stats_format(sch_mem_max, &sch_mem_max_dec, &sch_mem_max_frac, 
                   &sch_mem_max_units);

  // Worker work time
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_work_ids[i] > -1) {
      wrk_work_tot += stats[i][DBG_STATS_IDX_TIME_TASK_EXEC];
      if (stats[i][DBG_STATS_IDX_TIME_TASK_EXEC] < wrk_work_min) {
        wrk_work_min = stats[i][DBG_STATS_IDX_TIME_TASK_EXEC];
        wrk_work_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_TIME_TASK_EXEC] > wrk_work_max) {
        wrk_work_max = stats[i][DBG_STATS_IDX_TIME_TASK_EXEC];
        wrk_work_max_idx = i;
      }
    }
  }
  wrk_work_avg = wrk_work_tot / context->pr_num_workers;
  
  dbg_stats_format(wrk_work_avg, &wrk_work_avg_dec, &wrk_work_avg_frac, 
                   &wrk_work_avg_units);
  dbg_stats_format(wrk_work_min, &wrk_work_min_dec, &wrk_work_min_frac, 
                   &wrk_work_min_units);
  dbg_stats_format(wrk_work_max, &wrk_work_max_dec, &wrk_work_max_frac, 
                   &wrk_work_max_units);

  // Scheduler non-mem time
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_sched_ids[i] > -1) {
      sch_sched_tot += stats[i][DBG_STATS_IDX_TIME_SCH_SERVE];
      if (stats[i][DBG_STATS_IDX_TIME_SCH_SERVE] < sch_sched_min) {
        sch_sched_min = stats[i][DBG_STATS_IDX_TIME_SCH_SERVE];
        sch_sched_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_TIME_SCH_SERVE] > sch_sched_max) {
        sch_sched_max = stats[i][DBG_STATS_IDX_TIME_SCH_SERVE];
        sch_sched_max_idx = i;
      }
    }
  }
  sch_sched_avg = sch_sched_tot / context->pr_num_schedulers;
  
  dbg_stats_format(sch_sched_avg, &sch_sched_avg_dec, &sch_sched_avg_frac, 
                   &sch_sched_avg_units);
  dbg_stats_format(sch_sched_min, &sch_sched_min_dec, &sch_sched_min_frac, 
                   &sch_sched_min_units);
  dbg_stats_format(sch_sched_max, &sch_sched_max_dec, &sch_sched_max_frac, 
                   &sch_sched_max_units);

  // Worker wait time
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_work_ids[i] > -1) {
      wrk_wait_tot += stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT];
      if (stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT] < wrk_wait_min) {
        wrk_wait_min = stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT];
        wrk_wait_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT] > wrk_wait_max) {
        wrk_wait_max = stats[i][DBG_STATS_IDX_TIME_WORKER_WAIT];
        wrk_wait_max_idx = i;
      }
    }
  }
  wrk_wait_avg = wrk_wait_tot / context->pr_num_workers;
  
  dbg_stats_format(wrk_wait_avg, &wrk_wait_avg_dec, &wrk_wait_avg_frac, 
                   &wrk_wait_avg_units);
  dbg_stats_format(wrk_wait_min, &wrk_wait_min_dec, &wrk_wait_min_frac, 
                   &wrk_wait_min_units);
  dbg_stats_format(wrk_wait_max, &wrk_wait_max_dec, &wrk_wait_max_frac, 
                   &wrk_wait_max_units);

  // Scheduler & worker local tasks
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_sched_ids[i] > -1) {
      sch_tasks_tot += stats[i][DBG_STATS_IDX_NUM_TASKS];
      if (stats[i][DBG_STATS_IDX_NUM_TASKS] < sch_tasks_min) {
        sch_tasks_min = stats[i][DBG_STATS_IDX_NUM_TASKS];
        sch_tasks_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_NUM_TASKS] > sch_tasks_max) {
        sch_tasks_max = stats[i][DBG_STATS_IDX_NUM_TASKS];
        sch_tasks_max_idx = i;
      }
    }
    else {
      wrk_tasks_tot += stats[i][DBG_STATS_IDX_NUM_TASKS];
      if (stats[i][DBG_STATS_IDX_NUM_TASKS] < wrk_tasks_min) {
        wrk_tasks_min = stats[i][DBG_STATS_IDX_NUM_TASKS];
        wrk_tasks_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_NUM_TASKS] > wrk_tasks_max) {
        wrk_tasks_max = stats[i][DBG_STATS_IDX_NUM_TASKS];
        wrk_tasks_max_idx = i;
      }
    }
  }
  sch_tasks_avg = sch_tasks_tot / context->pr_num_schedulers;
  wrk_tasks_avg = wrk_tasks_tot / context->pr_num_workers;
  
  dbg_stats_format(sch_tasks_avg, &sch_tasks_avg_dec, &sch_tasks_avg_frac, 
                   &sch_tasks_avg_units);
  dbg_stats_format(sch_tasks_min, &sch_tasks_min_dec, &sch_tasks_min_frac, 
                   &sch_tasks_min_units);
  dbg_stats_format(sch_tasks_max, &sch_tasks_max_dec, &sch_tasks_max_frac, 
                   &sch_tasks_max_units);

  dbg_stats_format(wrk_tasks_avg, &wrk_tasks_avg_dec, &wrk_tasks_avg_frac, 
                   &wrk_tasks_avg_units);
  dbg_stats_format(wrk_tasks_min, &wrk_tasks_min_dec, &wrk_tasks_min_frac, 
                   &wrk_tasks_min_units);
  dbg_stats_format(wrk_tasks_max, &wrk_tasks_max_dec, &wrk_tasks_max_frac, 
                   &wrk_tasks_max_units);

  // Scheduler & worker number of messages
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_sched_ids[i] > -1) {
      sch_msg_tot += stats[i][DBG_STATS_IDX_NUM_MESSAGES];
      if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] < sch_msg_min) {
        sch_msg_min = stats[i][DBG_STATS_IDX_NUM_MESSAGES];
        sch_msg_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] > sch_msg_max) {
        sch_msg_max = stats[i][DBG_STATS_IDX_NUM_MESSAGES];
        sch_msg_max_idx = i;
      }
    }
    else {
      wrk_msg_tot += stats[i][DBG_STATS_IDX_NUM_MESSAGES];
      if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] < wrk_msg_min) {
        wrk_msg_min = stats[i][DBG_STATS_IDX_NUM_MESSAGES];
        wrk_msg_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_NUM_MESSAGES] > wrk_msg_max) {
        wrk_msg_max = stats[i][DBG_STATS_IDX_NUM_MESSAGES];
        wrk_msg_max_idx = i;
      }
    }
  }
  sch_msg_avg = sch_msg_tot / context->pr_num_schedulers;
  wrk_msg_avg = wrk_msg_tot / context->pr_num_workers;
  
  dbg_stats_format(sch_msg_avg, &sch_msg_avg_dec, &sch_msg_avg_frac, 
                   &sch_msg_avg_units);
  dbg_stats_format(sch_msg_min, &sch_msg_min_dec, &sch_msg_min_frac, 
                   &sch_msg_min_units);
  dbg_stats_format(sch_msg_max, &sch_msg_max_dec, &sch_msg_max_frac, 
                   &sch_msg_max_units);

  dbg_stats_format(wrk_msg_avg, &wrk_msg_avg_dec, &wrk_msg_avg_frac, 
                   &wrk_msg_avg_units);
  dbg_stats_format(wrk_msg_min, &wrk_msg_min_dec, &wrk_msg_min_frac, 
                   &wrk_msg_min_units);
  dbg_stats_format(wrk_msg_max, &wrk_msg_max_dec, &wrk_msg_max_frac, 
                   &wrk_msg_max_units);

  // Worker number of DMAs
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_work_ids[i] > -1) {
      wrk_ndma_tot += stats[i][DBG_STATS_IDX_NUM_DMAS];
      if (stats[i][DBG_STATS_IDX_NUM_DMAS] < wrk_ndma_min) {
        wrk_ndma_min = stats[i][DBG_STATS_IDX_NUM_DMAS];
        wrk_ndma_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_NUM_DMAS] > wrk_ndma_max) {
        wrk_ndma_max = stats[i][DBG_STATS_IDX_NUM_DMAS];
        wrk_ndma_max_idx = i;
      }
    }
  }
  wrk_ndma_avg = wrk_ndma_tot / context->pr_num_workers;
  
  dbg_stats_format(wrk_ndma_avg, &wrk_ndma_avg_dec, &wrk_ndma_avg_frac, 
                   &wrk_ndma_avg_units);
  dbg_stats_format(wrk_ndma_min, &wrk_ndma_min_dec, &wrk_ndma_min_frac, 
                   &wrk_ndma_min_units);
  dbg_stats_format(wrk_ndma_max, &wrk_ndma_max_dec, &wrk_ndma_max_frac, 
                   &wrk_ndma_max_units);

  // Worker DMA size
  for (i = 0; i < context->pr_num_cores; i++) {
    if (context->pr_core_work_ids[i] > -1) {
      wrk_sdma_tot += stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE];
      if (stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE] < wrk_sdma_min) {
        wrk_sdma_min = stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE];
        wrk_sdma_min_idx = i;
      }
      if (stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE] > wrk_sdma_max) {
        wrk_sdma_max = stats[i][DBG_STATS_IDX_DMA_TOTAL_SIZE];
        wrk_sdma_max_idx = i;
      }
    }
  }
  wrk_sdma_avg = wrk_sdma_tot / context->pr_num_workers;
  
  dbg_stats_format(wrk_sdma_avg, &wrk_sdma_avg_dec, &wrk_sdma_avg_frac, 
                   &wrk_sdma_avg_units);
  dbg_stats_format(wrk_sdma_min, &wrk_sdma_min_dec, &wrk_sdma_min_frac, 
                   &wrk_sdma_min_units);
  dbg_stats_format(wrk_sdma_max, &wrk_sdma_max_dec, &wrk_sdma_max_frac, 
                   &wrk_sdma_max_units);



  // Print summary to buffer
  buf = kt_malloc(SUMMARY_BUF_SIZE * sizeof(char));
  s = buf;
  s += kt_sprintf(s, 
"=============================================================================\r\n"
"                            Statistics Summary\r\n"
"=============================================================================\r\n"
"\r\n"
"             Schedulers                               Workers\r\n"
"          ----------------                         -------------\r\n"
"\r\n"
"Idle time:    %3d.%d %c (avg)             Idle time:    %3d.%d %c (avg)\r\n"
"              %3d.%d %c (min, core %3d)                 %3d.%d %c (min, core %3d)\r\n"
"              %3d.%d %c (max, core %3d)                 %3d.%d %c (max, core %3d)\r\n"
"\r\n"
"Mem time:     %3d.%d %c (avg)             Work time:    %3d.%d %c (avg)\r\n"
"              %3d.%d %c (min, core %3d)                 %3d.%d %c (min, core %3d)\r\n"
"              %3d.%d %c (max, core %3d)                 %3d.%d %c (max, core %3d)\r\n"
"\r\n"
"Non-mem time: %3d.%d %c (avg)             Wait time:    %3d.%d %c (avg)\r\n"
"              %3d.%d %c (min, core %3d)                 %3d.%d %c (min, core %3d)\r\n"
"              %3d.%d %c (max, core %3d)                 %3d.%d %c (max, core %3d)\r\n"
"\r\n"
"Local tasks:  %3d.%d %c (avg)             Local tasks:  %3d.%d %c (avg)\r\n"
"              %3d.%d %c (min, core %3d)                 %3d.%d %c (min, core %3d)\r\n"
"              %3d.%d %c (max, core %3d)                 %3d.%d %c (max, core %3d)\r\n"
"\r\n"
"Num messages: %3d.%d %c (avg)             Num messages: %3d.%d %c (avg)\r\n"
"              %3d.%d %c (min, core %3d)                 %3d.%d %c (min, core %3d)\r\n"
"              %3d.%d %c (max, core %3d)                 %3d.%d %c (max, core %3d)\r\n"
"\r\n"
"                                        Num DMAs:     %3d.%d %c (avg)\r\n"
"                                                      %3d.%d %c (min, core %3d)\r\n"
"                                                      %3d.%d %c (max, core %3d)\r\n"
"\r\n"
"                                        DMAed data:   %3d.%d %c (avg)\r\n"
"                                                      %3d.%d %c (min, core %3d)\r\n"
"                                                      %3d.%d %c (max, core %3d)\r\n"
"=============================================================================\r\n",
    sch_idle_avg_dec, sch_idle_avg_frac, sch_idle_avg_units,
    wrk_idle_avg_dec, wrk_idle_avg_frac, wrk_idle_avg_units,
    sch_idle_min_dec, sch_idle_min_frac, sch_idle_min_units, sch_idle_min_idx,
    wrk_idle_min_dec, wrk_idle_min_frac, wrk_idle_min_units, wrk_idle_min_idx,
    sch_idle_max_dec, sch_idle_max_frac, sch_idle_max_units, sch_idle_max_idx,
    wrk_idle_max_dec, wrk_idle_max_frac, wrk_idle_max_units, wrk_idle_max_idx,

    sch_mem_avg_dec, sch_mem_avg_frac, sch_mem_avg_units,
    wrk_work_avg_dec, wrk_work_avg_frac, wrk_work_avg_units,
    sch_mem_min_dec, sch_mem_min_frac, sch_mem_min_units, sch_mem_min_idx,
    wrk_work_min_dec, wrk_work_min_frac, wrk_work_min_units, wrk_work_min_idx,
    sch_mem_max_dec, sch_mem_max_frac, sch_mem_max_units, sch_mem_max_idx,
    wrk_work_max_dec, wrk_work_max_frac, wrk_work_max_units, wrk_work_max_idx,

    sch_sched_avg_dec, sch_sched_avg_frac, sch_sched_avg_units,
    wrk_wait_avg_dec, wrk_wait_avg_frac, wrk_wait_avg_units,
    sch_sched_min_dec, sch_sched_min_frac, sch_sched_min_units, sch_sched_min_idx,
    wrk_wait_min_dec, wrk_wait_min_frac, wrk_wait_min_units, wrk_wait_min_idx,
    sch_sched_max_dec, sch_sched_max_frac, sch_sched_max_units, sch_sched_max_idx,
    wrk_wait_max_dec, wrk_wait_max_frac, wrk_wait_max_units, wrk_wait_max_idx,

    sch_tasks_avg_dec, sch_tasks_avg_frac, sch_tasks_avg_units,
    wrk_tasks_avg_dec, wrk_tasks_avg_frac, wrk_tasks_avg_units,
    sch_tasks_min_dec, sch_tasks_min_frac, sch_tasks_min_units, sch_tasks_min_idx,
    wrk_tasks_min_dec, wrk_tasks_min_frac, wrk_tasks_min_units, wrk_tasks_min_idx,
    sch_tasks_max_dec, sch_tasks_max_frac, sch_tasks_max_units, sch_tasks_max_idx,
    wrk_tasks_max_dec, wrk_tasks_max_frac, wrk_tasks_max_units, wrk_tasks_max_idx,

    sch_msg_avg_dec, sch_msg_avg_frac, sch_msg_avg_units,
    wrk_msg_avg_dec, wrk_msg_avg_frac, wrk_msg_avg_units,
    sch_msg_min_dec, sch_msg_min_frac, sch_msg_min_units, sch_msg_min_idx,
    wrk_msg_min_dec, wrk_msg_min_frac, wrk_msg_min_units, wrk_msg_min_idx,
    sch_msg_max_dec, sch_msg_max_frac, sch_msg_max_units, sch_msg_max_idx,
    wrk_msg_max_dec, wrk_msg_max_frac, wrk_msg_max_units, wrk_msg_max_idx,

    wrk_ndma_avg_dec, wrk_ndma_avg_frac, wrk_ndma_avg_units,
    wrk_ndma_min_dec, wrk_ndma_min_frac, wrk_ndma_min_units, wrk_ndma_min_idx,
    wrk_ndma_max_dec, wrk_ndma_max_frac, wrk_ndma_max_units, wrk_ndma_max_idx,

    wrk_sdma_avg_dec, wrk_sdma_avg_frac, wrk_sdma_avg_units,
    wrk_sdma_min_dec, wrk_sdma_min_frac, wrk_sdma_min_units, wrk_sdma_min_idx,
    wrk_sdma_max_dec, wrk_sdma_max_frac, wrk_sdma_max_units, wrk_sdma_max_idx

  );
  // Note: this check runs after the write, so it can only detect (not
  // prevent) an overflow; >= accounts for the terminating NUL
  if (s - buf >= SUMMARY_BUF_SIZE) {
    ar_panic("Summary buffer overflow");
  }

  // Print it
  ar_uart_flush();
  ar_timer_busy_wait_msec(200);
  kt_printf("\r\n%s\r\n", buf);


  // =========================================================================
  // Dump both the summary and all the per-core analytical stats to a file
  // =========================================================================
  
  // Print the file-dump begin header
  ar_uart_flush();
  ar_timer_busy_wait_msec(200);
  kt_printf(DBG_FILE_DUMP_BEGIN_FORMAT, filename);
  ar_uart_flush();
  ar_timer_busy_wait_msec(20);

  // Print the summary
  kt_printf("%s\r\n", buf);

  // For all cores in the setup
  for (i = 0; i < context->pr_num_cores; i++) {
    
    // Print the analytical stats
    kt_printf("==================================================\r\n");
    kt_printf("Analytical Statistics for Core %d [%s]\r\n", 
              i,
              (context->pr_core_sched_ids[i] > -1) ? "Scheduler" : 
                                                     "Worker");
    kt_printf("==================================================\r\n");
    for (j = 0; j < DBG_STATS_NUM_STATS; j++) {
      // Skip fields that are only meaningful for workers (work/wait time
      // and the DMA stats) when printing a scheduler core
      if ((context->pr_core_sched_ids[i] > -1) &&
           ((j == 1) || (j == 2) || (j == 7) || (j == 8))) {
        continue;
      }
      dbg_stats_format_number(fmt_buf, stats[i][j]);
      kt_printf("%s %15s\r\n",
          (j == 0) ? "Idle time:    " :
          (j == 1) ? "Work time:    " :
          (j == 2) ? "Wait time:    " :
          (j == 3) ? "Mem time:     " :
          (j == 4) ? "Non-mem time: " :
          (j == 5) ? "Local tasks:  " :
          (j == 6) ? "Num messages: " :
          (j == 7) ? "Num DMAs:     " :
          (j == 8) ? "DMAed data:   " : 
                     "!!!ERROR!!!",
          fmt_buf);
    }
    kt_printf("==================================================\r\n\r\n");
  }

  // Print the file-dump end header
  ar_uart_flush();
  ar_timer_busy_wait_msec(200);
  kt_printf(DBG_FILE_DUMP_END_FORMAT, filename);
  ar_uart_flush();
  ar_timer_busy_wait_msec(20);

  // Free stuff
  kt_free(stats);
  kt_free(buf);


#endif
}
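
The listing above leans on a helper, dbg_stats_format(), that is not shown
here. Judging from the "%3d.%d %c" conversions in the summary, it splits a
raw counter into an integer part, a single fractional digit and a unit
character. A minimal sketch under those assumptions (the thresholds, the
truncating rounding and the K/M/G unit letters are guesses; the real helper
may well differ):

// Hedged sketch of a formatter compatible with the "%3d.%d %c" usage above;
// dbg_stats_format_sketch and its unit thresholds are assumptions, not the
// runtime's actual implementation.
static void dbg_stats_format_sketch(unsigned int val, int *dec, int *frac,
                                    char *units) {
  unsigned int scale = 1;

  if      (val >= 1000000000) { scale = 1000000000; *units = 'G'; }
  else if (val >= 1000000)    { scale = 1000000;    *units = 'M'; }
  else if (val >= 1000)       { scale = 1000;       *units = 'K'; }
  else                        {                     *units = ' '; }

  *dec = val / scale;                       // integer part
  if (scale == 1) {
    *frac = 0;                              // no fractional digit for raw counts
  }
  else {
    *frac = (val % scale) / (scale / 10);   // one truncated fractional digit
  }
}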
Example #18
// ===========================================================================
// kt_malloc()                  Kernel basic allocation function. During
//                              bootstrap it allocates serially in predefined
//                              places (see above); after bootstrap it calls
//                              the slab allocator.
// ===========================================================================
// * INPUTS
//   size_t size                Number of bytes to be allocated (can be 0)
//
// * RETURN VALUE
//   void *                     Pointer to new allocated object. Note that
//                              NULL will not be returned for a non-zero size
//                              -- out of memory in kernel space will trigger
//                              an abort.
// ===========================================================================
void *kt_malloc(size_t size) {
  
  Context       *context;
  int           my_cid;
  size_t        kernel_base;
  size_t        kernel_end;
  int           *bootstrap_slots;
  size_t        ptr;
  int           i;


  // Allow dummy mallocs
  if (!size) {
    return NULL;
  }
  
  // Reject requests of 2 GB or more. We won't support them, even for the
  // x86_64 port: it is easier to work internally with signed integers,
  // because error checking and assertions work way better.
  ar_assert(size < ((size_t) 1 << 31));

  // Get global context and boundaries
  my_cid = ar_get_core_id();
  context = mm_get_context(my_cid);
  kernel_base = mm_va_kernel_base(my_cid);
  kernel_end = kernel_base + MM_KERNEL_SIZE - 1024 * 1024;

  // Align size request to nearest allowed size
  if (size & (MM_ALLOC_ALIGN - 1)) {
    size = (size & ~(MM_ALLOC_ALIGN - 1)) + MM_ALLOC_ALIGN;
  }

  // Bootstrapping code
  if (context->mm_alloc_bootstrap) {
    bootstrap_slots = (int *) kernel_end;

    // We support only up to MM_BOOTSTRAP_MAX_SLOT sized requests; increase
    // that if this assertion fails (it means bigger objects are needed and
    // bootstrap has to allow this)
    ar_assert(size <= MM_BOOTSTRAP_MAX_SLOT);

    // Allocate directly from the beginning of the kernel heap, keeping the
    // per-slot allocation counters on the last heap page
    i = size / MM_ALLOC_ALIGN - 1; // find slot
    ptr = kernel_base +                                // base address
          i * MM_SLAB_SIZE * MM_BOOTSTRAP_SLABS_STEP + // slabs per slot
          bootstrap_slots[i] * size;                   // slot address
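
    // Worked example (MM_ALLOC_ALIGN = 64 and MM_SLAB_SIZE = 4096 are
    // hypothetical values): a 128-byte request gives slot i = 1, so it is
    // served from the second MM_BOOTSTRAP_SLABS_STEP-slab window above
    // kernel_base, at byte offset bootstrap_slots[1] * 128 in that window.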
    
    // Remember how many objects we've allocated
    bootstrap_slots[i]++;

    // Make sure enough slots can fit into MM_BOOTSTRAP_SLABS_STEP; otherwise,
    // the define must be increased
    ar_assert(bootstrap_slots[i] * size <= 
           MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE);

    return (void *) ptr;
  }


  // Do a normal allocation from the kernel pool
  if (mm_slab_alloc_slot(context->mm_kernel_pool, size, &ptr)) {
    // Kernel memory should never get full
    ar_abort();
  }
  ar_assert(ptr >= kernel_base);
  ar_assert(ptr <  kernel_end + MM_PAGE_SIZE);

  return (void *) ptr;
}
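
The branch near the top of kt_malloc() rounds every request up to a multiple
of MM_ALLOC_ALIGN. The same arithmetic as a standalone sketch (the concrete
value in the comment is hypothetical; MM_ALLOC_ALIGN must be a power of two
for the bit-masking to work):

// Round size up to the next multiple of align (align: power of two).
// With align = 64: 1 -> 64, 64 -> 64, 65 -> 128.
static size_t align_up(size_t size, size_t align) {
  if (size & (align - 1)) {
    size = (size & ~(align - 1)) + align;
  }
  return size;
}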
Example #19
// ===========================================================================
// kt_free()                    Frees an object. During bootstrap it simply
//                              records the request so the object can be
//                              freed later; after bootstrap it calls the
//                              slab allocator to free it.
// ===========================================================================
// * INPUTS
//   void *ptr                  Object to be freed
// ===========================================================================
void kt_free(void *ptr) {

  Context       *context;
  int           my_cid;
  size_t        kernel_base;
  size_t        kernel_end;
  int           *bootstrap_slots;
  int           *counter;
  void          **free_slots;
  int           slot_id;
  int           slot_offset;


  // Dummy free?
  if (!ptr) {
    return;
  }
  
  // Get global context and boundaries
  my_cid = ar_get_core_id();
  context = mm_get_context(my_cid);
  kernel_base = mm_va_kernel_base(my_cid);
  kernel_end = kernel_base + MM_KERNEL_SIZE - 1024 * 1024;

  // Sanity checks
  ar_assert (!((size_t) ptr & (MM_ALLOC_ALIGN - 1)));
  ar_assert((size_t) ptr >= kernel_base);
  ar_assert((size_t) ptr <  kernel_end + MM_PAGE_SIZE);

  // Bootstrapping?
  if (context->mm_frees_bootstrap) {

    // Verify this is a slot we actually handed out
    bootstrap_slots = (int *) kernel_end;
    slot_id = ((size_t) ptr - kernel_base) / 
        (MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE);
    ar_assert((slot_id + 1) * MM_ALLOC_ALIGN <= MM_BOOTSTRAP_MAX_SLOT);
    ar_uint_divide((size_t) ptr - kernel_base -
                       (slot_id * MM_BOOTSTRAP_SLABS_STEP * MM_SLAB_SIZE),
                   (slot_id + 1) * MM_ALLOC_ALIGN,
                   (unsigned int *) &slot_offset, NULL);
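
    // The divide above computes slot_offset = (ptr's byte offset inside its
    // slot window) / object size, where slot slot_id holds objects of
    // (slot_id + 1) * MM_ALLOC_ALIGN bytes each; ar_uint_divide() is
    // presumably used because the divisor is not a compile-time constant.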
    ar_assert(slot_offset < bootstrap_slots[slot_id]);
    
    // Record the free request
    counter = (int *) ((size_t) kernel_end - MM_PAGE_SIZE);
    free_slots = (void **) ((size_t) kernel_end - MM_PAGE_SIZE + sizeof(int));
    free_slots[*counter] = ptr;
    (*counter)++;
    
    // Make sure we don't overflow the array
    ar_assert((MM_PAGE_SIZE - sizeof(int)) / sizeof(void *) > *counter);

    return;
  }

  // Do a normal free from the kernel pool
  ar_assert(!mm_slab_free_slot(context->mm_kernel_pool, (size_t) ptr));
}
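
Taken together, kt_malloc() and kt_free() behave like a regular malloc/free
pair once bootstrap is over, with two documented differences: out-of-memory
aborts instead of returning NULL, and zero-size/NULL arguments are accepted
as no-ops. A small usage sketch (DemoRecord and demo_alloc_free are
hypothetical names):

// Usage sketch; assumes the runtime is past bootstrap.
typedef struct {
  int   id;
  float value;
} DemoRecord;

void demo_alloc_free(void) {
  DemoRecord *rec;

  rec = kt_malloc(sizeof(DemoRecord));  // never NULL: OOM aborts instead
  rec->id    = 1;
  rec->value = 3.5F;

  kt_free(rec);                         // normal free from the kernel pool
  kt_free(NULL);                        // explicitly allowed "dummy free"
}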
Example #20
// ===========================================================================
// kmeans_mpi()                 Runs a simple k-means clustering benchmark on
//                              top of MPI: each rank owns a slice of randomly
//                              generated objects, assigns them to the nearest
//                              cluster center, and partial results are
//                              reduced on rank 0 every repetition.
// ===========================================================================
// * INPUTS
//   int num_procs              MPI processors to use
//   int num_clusters           Number of output clusters
//   int num_objects            Number of total input objects
//   int num_reps               Loop repetitions
//
// * RETURN VALUE
//   int                        0 for success, 1 on bad configuration
// ===========================================================================
int kmeans_mpi(int num_procs,          // MPI processors to use
               int num_clusters,       // number of output clusters
               int num_objects,        // number of total input objects
               int num_reps) {         // loop repetitions

  int           num_cores;
  int           rank;
  int           objects_per_core;
  float         *objects;
  int           *membership;
  float         *clusters;
  float         *partial_clusters;
  int           *partial_sizes;
  float         *reduce_clusters;
  int           *reduce_sizes;
  unsigned int  seed = 42;
  unsigned int  time_start = 0;
  unsigned int  time_stop;
  unsigned int  time;
  int           i;
  int           j;
  int           loop;


  // Who are we?
  MPI_Comm_size(MPI_COMM_WORLD, &num_cores);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Sanity checks
  if (num_cores < num_procs) {
    if (!rank) {
      kt_printf("Cannot run with %d cores, MPI setup has only %d cores\r\n",
                num_procs, num_cores);
    }
    return 1;
  }
  if (num_objects % num_procs) {
    kt_printf("%d objects not divisible by %d cores\r\n", 
              num_objects, num_procs);
    return 1;
  }
  objects_per_core = num_objects / num_procs;


  // Synchronize everyone and print infomercial
  MPI_Barrier(MPI_COMM_WORLD);
  if (!rank) {
    kt_printf("k-means of %d -> %d starting on %d core(s)\r\n",
              num_objects, num_clusters, num_procs);
  }
  MPI_Barrier(MPI_COMM_WORLD);




  // Create random input objects per core
  objects = kt_malloc(objects_per_core * COORDS * sizeof(float));
  for (i = 0; i < objects_per_core; i++) {
    for (j = 0; j < COORDS; j++) {
      objects[i * COORDS + j] = (float) ((seed = kt_rand(seed)) % 1000) / 10.0F;
    }
  }

  // Create membership, no object belongs to any cluster yet
  membership = kt_malloc(objects_per_core * sizeof(int));
  for (i = 0; i < objects_per_core; i++) {
    membership[i] = -1;
  }

  // Create partial cluster arrays, to be used as temporary buffers, and their
  // size arrays. Init everything to a neutral value, so they can be used
  // for reductions even with cores that are not part of the setup.
  partial_clusters = kt_malloc(num_clusters * COORDS * sizeof(float));
  partial_sizes = kt_malloc(num_clusters * sizeof(int));
  for (i = 0; i < num_clusters; i++) {
    partial_sizes[i] = 0;
    for (j = 0; j < COORDS; j++) {
      partial_clusters[i * COORDS + j] = 0.0F;
    }
  }

  // Rank 0 allocates extra buffers for the reductions
  if (!rank) {
    reduce_clusters = kt_malloc(num_clusters * COORDS * sizeof(float));
    reduce_sizes = kt_malloc(num_clusters * sizeof(int));
  }
  else {
    reduce_clusters = NULL;
    reduce_sizes = NULL;
  }


  // Allocate the persistent cluster array (updated across loop repetitions)
  // for everyone. For rank 0, prepare the initial clusters with centers
  // copied from its first objects.
  clusters = kt_malloc(num_clusters * COORDS * sizeof(float));

  if (!rank) {
    ar_assert(objects_per_core >= num_clusters);

    for (i = 0; i < num_clusters; i++) {
      for (j = 0; j < COORDS; j++) {
        clusters[i * COORDS + j] = objects[i * COORDS + j];
      }
    }
  }


  // Keep time
  MPI_Barrier(MPI_COMM_WORLD);
  if (!rank) {
    time_start = ar_free_timer_get_ticks();
  }


  // For all repetitions
  for (loop = 0; loop < num_reps; loop++) {

    // Broadcast clusters for this rep from core 0 to everyone
    MPI_Bcast(clusters, num_clusters * COORDS, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // Process our objects (only for cores belonging to the setup).
    if (rank < num_procs) {
      kmeans_mpi_do_tile(objects, membership, partial_clusters, partial_sizes, 
                         clusters, objects_per_core, num_clusters);
    }

    // Reduce all results to rank 0 (non-setup cores will add their 0 values)
    MPI_Reduce(partial_clusters, reduce_clusters, num_clusters * COORDS,
               MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(partial_sizes, reduce_sizes, num_clusters,
               MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    // Rank 0 prepares the next clusters, based on reduction values
    if (!rank) {
      kt_printf("done %d of %d\r\n", loop + 1, num_reps);
      for (i = 0; i < num_clusters; i++) {
        // Keep the old center if a cluster ended up empty, to avoid
        // dividing by zero
        if (!reduce_sizes[i]) {
          continue;
        }
        for (j = 0; j < COORDS; j++) {
          clusters[i * COORDS + j] = reduce_clusters[i * COORDS + j] /
                                     (float) reduce_sizes[i];
        }
      }
    }
  }


  MPI_Barrier(MPI_COMM_WORLD);

  // Compute elapsed time
  if (!rank) {
    time_stop = ar_free_timer_get_ticks();
    // Unsigned subtraction gives the correct elapsed count even if the
    // timer wrapped around once
    time = time_stop - time_start;
    kt_printf("Time: %10u cycles (%6u msec)\r\n", time, time / 10000);

    // Print clusters
    for (i = 0; i < num_clusters; i++) {
      kt_printf("Cluster %d: %f %f %f\r\n", 
                i, 
                clusters[i * COORDS], 
                clusters[i * COORDS + 1], 
                clusters[i * COORDS + 2]);
    }
  }

  // Free stuff
  kt_free(clusters);
  kt_free(partial_sizes);
  kt_free(partial_clusters);
  kt_free(reduce_sizes);
  kt_free(reduce_clusters);
  kt_free(objects);
  kt_free(membership);

  return 0;
}
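
kmeans_mpi_do_tile() is not part of this listing. A minimal sketch of what
it presumably does, inferred from the call site above: assign each local
object to its nearest center and accumulate per-cluster coordinate sums and
sizes for the MPI_Reduce step. kmeans_mpi_do_tile_sketch is a hypothetical
stand-in, not the original implementation.

// Hedged sketch of the per-tile k-means step (nearest-center assignment
// plus partial-sum accumulation); COORDS as used throughout this example.
void kmeans_mpi_do_tile_sketch(float *objects, int *membership,
                               float *partial_clusters, int *partial_sizes,
                               float *clusters, int num_objects,
                               int num_clusters) {
  int   i, j, c, best;
  float d, dist, best_dist;

  // Reset this repetition's accumulators (non-setup cores never get here,
  // so their zero-initialized buffers stay neutral for the reduction)
  for (c = 0; c < num_clusters; c++) {
    partial_sizes[c] = 0;
    for (j = 0; j < COORDS; j++) {
      partial_clusters[c * COORDS + j] = 0.0F;
    }
  }

  for (i = 0; i < num_objects; i++) {

    // Find the nearest center by squared Euclidean distance
    best = 0;
    best_dist = 0.0F;
    for (c = 0; c < num_clusters; c++) {
      dist = 0.0F;
      for (j = 0; j < COORDS; j++) {
        d = objects[i * COORDS + j] - clusters[c * COORDS + j];
        dist += d * d;
      }
      if ((c == 0) || (dist < best_dist)) {
        best_dist = dist;
        best = c;
      }
    }

    // Record membership and accumulate partial results
    membership[i] = best;
    partial_sizes[best]++;
    for (j = 0; j < COORDS; j++) {
      partial_clusters[best * COORDS + j] += objects[i * COORDS + j];
    }
  }
}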
Example #21
// ===========================================================================
// _sys_spawn()                 Spawns a new task for parallel execution.
// ===========================================================================
// * INPUTS
//   char *filename             Source code filename where this call is done
//   int line_nr                Line number in filename this call is done
//   unsigned int idx           Which task to run, specified as an index
//                              to a global function pointer table named
//                              void (*_sys_task_table[])
//   void **args                Array of in-order task arguments. All arguments
//                              must fit into a void * typecast.
//   unsigned int *types        Array of in-order argument types. Each type
//                              is a bitmask of SYS_TYPE_* properties as 
//                              defined in syscall.h header.
//   unsigned int num_args      Number of task arguments
// ===========================================================================
void _sys_spawn(char *filename, int line_nr, unsigned int idx, void **args, 
                unsigned int *types, unsigned int num_args) {

  Context       *context;
  PrMsgReq      *req;
  ListNode      *node;
  PrTaskDescr   *ptask;
  PrEvtPending  *event;
  int           i;
  int           j;


  // Get context
  context = mm_get_context(ar_get_core_id());

  // What's the parent task?
  node = kt_list_head(context->pr_ready_queue);
  ar_assert(node);
  ptask = node->data;
  ar_assert(ptask);

  // Sanity checks
  ar_assert(idx < (1 << PR_TASK_IDX_SIZE));
  ar_assert(num_args > 0); // No task should be declared without a memory
                           // footprint. If for some reason this is desirable,
                           // the while loop below must be altered (it won't
                           // work for num_args == 0).

//kt_printf("%d: Parent task = 0x%X (idx = %d), spawning task idx = %d\r\n",
//          context->pr_core_id, ptask->id, ptask->index, idx);


  // Single-core mode?
  if (context->pr_num_cores == 1) {
    // FIXME: don't involve scheduler, ar_exec() directly...
    ar_abort();
  }


  // Make sure we haven't got any previous spawn request pending. We throttle
  // the spawn rate here, in order to avoid race conditions: we are allowed
  // to spawn a new task only when the scheduler has signalled (through a 
  // REPLY_SPAWN) that the old task spawn has progressed up to a safe point
  // which will not create any race.
  if (context->pr_spawn_pending) {
    ar_assert(!pr_event_worker_inner_loop(1, 0, 0, NULL));
  }

  // Mark that a new spawn request is now pending
  ar_assert(!context->pr_spawn_pending);
  context->pr_spawn_pending = 1;


  // Enter multi-part request creation loop
  i = 0;
  j = 0;
  req = NULL;
  while (i < num_args) {

    // New message part?
    if (!j) {

      // Get a request buffer
      req = noc_msg_send_get_buf(pr_scheduler_core_id(
                                                context->pr_parent_sched_id));

      // Build new message
      req->core_id     = context->pr_core_id;
      req->req_id      = context->pr_message_id; // same ID for all parts
      req->type        = EXT_REQ_SPAWN;
      req->size        = 0;                 // init I/O type bitmap
      req->region      = 0;                 // init region/byvalue type bitmap
      req->ptr         = (void *) ((ptask->id << PR_TASK_IDX_SIZE) | idx);
      req->num_regions = (i != 0);          // 0 for the first message part,
                                            // 1 for continuation parts
    }

    // Fill out next argument
    req->data[j] = args[i];

    req->size |= (types[i] & SYS_TYPE_IN_ARG)  ? (1 << (2 * j))     : 0;
    req->size |= (types[i] & SYS_TYPE_OUT_ARG) ? (1 << (2 * j + 1)) : 0;

    req->region |= (types[i] & SYS_TYPE_SAFE_ARG)   ? (1 << (2 * j))     : 0;
    req->region |= (types[i] & SYS_TYPE_REGION_ARG) ? (1 << (2 * j + 1)) : 0;
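
    // Each argument consumes two bits in each bitmap: req->size packs the
    // IN/OUT flags and req->region packs the SAFE/REGION flags of argument
    // i at bit positions 2*j and 2*j + 1 of the current message part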

//kt_printf("Spawn arg %d: %s %s %s %s\r\n", i,
//    (types[i] & SYS_TYPE_IN_ARG) ? "IN" : "",
//    (types[i] & SYS_TYPE_OUT_ARG) ? "OUT" : "",
//    (types[i] & SYS_TYPE_SAFE_ARG) ? "SAFE" : "",
//    (types[i] & SYS_TYPE_REGION_ARG) ? "REGION" : ""
//    );

    i++;
    j++;

    // Finished with this part?
    if ((j == PR_REQ_MAX_SIZE) || (i == num_args)) {

      // More to follow?
      if ((j == PR_REQ_MAX_SIZE) && (i < num_args)) {
        req->num_ptrs = -1;
      }
      else {
        req->num_ptrs = j;
      }

      // Send message to scheduler
      ar_assert(!noc_msg_send());

      // Finished with this request buffer
      j = 0;
      req = NULL;
    }
  }

  // Create a note-to-self event to wait for the reply of this spawn request
  event = kt_malloc(sizeof(PrEvtPending));
  event->req = kt_malloc(sizeof(PrMsgReq));

  event->req->core_id = -1;
  event->req->req_id  = -1;
  event->req->type = SELF_WAIT_SPAWN;
  event->action = PR_ACT_REDO;
  event->prev = NULL;
  event->next = NULL;
  event->data = NULL;
  
  // Store event; we don't expect conflicts on this message ID.
  ar_assert(!kt_trie_insert(context->pr_pending_events, context->pr_message_id, 
                            event));

  // Increase message ID, avoiding value 0 on wrap-arounds
  context->pr_message_id = pr_advance_msg_id(context->pr_message_id);
}
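
A hedged usage sketch of _sys_spawn(): the caller hands over per-argument
value and type arrays, where each type is an OR of the SYS_TYPE_* flags
described in the header comment. MY_TASK_IDX, spawn_example and the argument
choices are hypothetical; real task table indices come from elsewhere in the
build.

// Hypothetical caller of _sys_spawn(); assumes index MY_TASK_IDX points at
// the intended entry of _sys_task_table.
void spawn_example(void *region_arg, int scalar_arg) {
  void         *args[2];
  unsigned int types[2];

  // Argument 0: a region handle used for both input and output
  args[0]  = region_arg;
  types[0] = SYS_TYPE_IN_ARG | SYS_TYPE_OUT_ARG | SYS_TYPE_REGION_ARG;

  // Argument 1: a by-value scalar; must fit into a void *
  args[1]  = (void *) (size_t) scalar_arg;
  types[1] = SYS_TYPE_IN_ARG | SYS_TYPE_SAFE_ARG;

  _sys_spawn(__FILE__, __LINE__, MY_TASK_IDX, args, types, 2);
}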