Пример #1
0
/* Translate this core's simulator configuration into the McPAT-style
   system_core / system_L2 parameter structures.  The base class fills in
   the common fields; this override supplies the DPM (out-of-order)
   machine configuration and, when present, the private L2 data cache. */
void core_power_DPM_t::translate_params(system_core *core_params, system_L2 *L2_params)
{
  core_power_t::translate_params(core_params, L2_params);

  /* Fixed machine configuration for the out-of-order pipeline. */
  core_params->machine_type = 0; // OoO
  core_params->number_hardware_threads = 2;
  core_params->number_instruction_fetch_ports = 2;
  core_params->fp_issue_width = 2;
  core_params->prediction_width = 1;
  core_params->pipelines_per_core[0] = 1;
  core_params->pipelines_per_core[1] = 1;
  core_params->pipeline_depth[0] = 15;
  core_params->pipeline_depth[1] = 15;

  /* Window/rename configuration; only the RS size comes from the knobs. */
  core_params->instruction_window_scheme = 1; //RSBASED 0; // PHYREG
  core_params->instruction_window_size = core->knobs->exec.RS_size;
  core_params->archi_Regs_IRF_size = 16;
  core_params->archi_Regs_FRF_size = 32;
  core_params->phy_Regs_IRF_size = 256;
  core_params->phy_Regs_FRF_size = 256;
  core_params->rename_scheme = 0; //RAM-based
  core_params->register_windows_size = 0;
  strcpy(core_params->LSU_order, "inorder");
  core_params->memory_ports = 2;

  // private L2 data cache
  if (core->memory.DL2 != nullptr)
  {
    zesto_assert(L2_params != nullptr, (void)0);

    auto *DL2 = core->memory.DL2;

    L2_params->L2_config[0] = DL2->sets * DL2->assoc * DL2->linesize; // capacity in bytes
    L2_params->L2_config[1] = DL2->linesize;
    L2_params->L2_config[2] = DL2->assoc;
    L2_params->L2_config[3] = DL2->banks;
    L2_params->L2_config[5] = DL2->latency;
    // See LLC comment in base class for setting throughput == latency.
    L2_params->L2_config[4] = DL2->latency;
    L2_params->L2_config[6] = DL2->bank_width;
    L2_params->L2_config[7] = (DL2->write_policy == WRITE_THROUGH) ? 0 : 1;

    L2_params->device_type = XML->sys.device_type;

    L2_params->ports[0] = 0;
    L2_params->ports[1] = 0;
    L2_params->ports[2] = 1;

    // # MSHRs
    L2_params->buffer_sizes[0] = DL2->MSHR_size;
    // # fill buffers
    L2_params->buffer_sizes[1] = DL2->heap_size;
    // # PF buffers
    L2_params->buffer_sizes[2] = DL2->PFF_size;
    // # WB buffers
    L2_params->buffer_sizes[3] = DL2->MSHR_WB_size;
  }
}
Пример #2
0
/* Translate this core's simulation statistics into the McPAT-style
   system_core / system_L2 stat structures.  The base class handles the
   common stats; this override fills in the DPM-specific counters.

   Fix: the original looked up "commit_uops" and "alloc_uops" twice each
   in the stats database; each counter is now fetched exactly once. */
void core_power_DPM_t::translate_stats(xiosim::stats::StatsDatabase* sdb,
                                       system_core* core_stats,
                                       system_L2* L2_stats) {
  core_power_t::translate_stats(sdb, core_stats, L2_stats);

  int coreID = core->id;

  /* Look up a per-core counter by name and return its final value. */
  auto final_val = [sdb, coreID](const char* name) {
    return stat_find_core_stat<counter_t>(sdb, coreID, name)->get_final_val();
  };

  /* All uops are accounted as integer; FP is folded into the int counts. */
  core_stats->int_instructions = final_val("fetch_uops");
  core_stats->fp_instructions = 0;

  /* "commit_uops" feeds both the committed-instruction count and the
     ROB-read count, so fetch it once. */
  const auto commit_uops = final_val("commit_uops");
  core_stats->committed_int_instructions = commit_uops;
  core_stats->committed_fp_instructions = 0;

  core_stats->ROB_reads = commit_uops;
  core_stats->ROB_writes = final_val("ROB_writes");
  core_stats->rename_reads = final_val("regfile_reads");
  core_stats->rename_writes = final_val("regfile_writes");
  core_stats->fp_rename_reads = final_val("fp_regfile_reads");
  core_stats->fp_rename_writes = final_val("fp_regfile_writes");

  /* Likewise "alloc_uops" supplies both window reads and writes. */
  const auto alloc_uops = final_val("alloc_uops");
  core_stats->inst_window_reads = alloc_uops;
  core_stats->inst_window_writes = alloc_uops;
  core_stats->inst_window_wakeup_accesses = 0;
  core_stats->fp_inst_window_reads = 0;
  core_stats->fp_inst_window_writes = 0;
  core_stats->fp_inst_window_wakeup_accesses = 0;

  core_stats->context_switches = final_val("oracle_total_calls");

  /* Private L2 data cache stats, if this core has a DL2. */
  if (core->memory.DL2)
  {
    zesto_assert(L2_stats != NULL, (void)0);

    L2_stats->read_accesses = final_val("DL2.load_lookups");
    L2_stats->read_misses = final_val("DL2.load_misses");
    L2_stats->write_accesses = final_val("DL2.store_lookups");
    L2_stats->write_misses = final_val("DL2.store_misses");
  }
}
Пример #3
0
/* In-order instruction commit.  Individual uops cannot commit
   until it is guaranteed that the entire Mop's worth of uops will
   commit. */
void core_commit_IO_DPM_t::IO_step(void)
{
  struct core_knobs_t * knobs = core->knobs;
  int commit_count = 0;   /* uops committed this cycle; bounded by commit.width */
  stall_reason = CSTALL_NONE;
  int branches_committed = 0;

  /* This is just a deadlock watchdog. If something got messed up
     in the pipeline and no forward progress is being made, this
     code will eventually detect it. A global watchdog will check
     if any core is making progress and accordingly if not.*/
  if(core->current_thread->active && ((core->sim_cycle - core->exec->last_completed) > deadlock_threshold))
  {
    deadlocked = true; 
#ifdef ZTRACE
    ztrace_print(core->id, "Possible deadlock detected.");
#endif
    return;
  }

  /* deallocate at most one store from the (senior) STQ per cycle */
  core->exec->STQ_deallocate_senior();

  /* MAIN COMMIT LOOP */
  for(commit_count=0;commit_count<knobs->commit.width;commit_count++)
  {
    if(ROB_num <= 0) /* nothing to commit */
    {
      stall_reason = CSTALL_EMPTY;
      break;
    }

    /* oldest in-flight Mop: parent of the uop at the head of the ROB */
    struct Mop_t * Mop = ROB[ROB_head]->Mop; 

    /* For branches, don't commit until the corresponding jeclear
       (if any) has been processed by the front-end. */
    if(Mop->commit.jeclear_in_flight)
    {
      stall_reason = CSTALL_JECLEAR_INFLIGHT;
      break;
    }

    /* enforce the per-cycle branch commit limit, if one is configured */
    if(Mop->decode.is_ctrl && knobs->commit.branch_limit && (branches_committed >= knobs->commit.branch_limit))
    {
      stall_reason = CSTALL_MAX_BRANCHES;
      break;
    }

    if(Mop->oracle.spec_mode)
//      zesto_fatal("oldest instruction in processor is on wrong-path",(void)0);
      zesto_assert(false, (void)0);

    /* Are all uops in the Mop completed? */
    if(Mop->commit.complete_index != -1) /* still some outstanding insts */
    {
      struct uop_t * uop = &Mop->uop[Mop->commit.complete_index];

      /* Walk the remaining uops in flow order.  STA/STD uops are let
         through even if not yet marked complete, because stores are
         inserted into the STQ here at commit time. */
      while(uop->timing.when_completed <= core->sim_cycle
            || uop->decode.is_sta || uop->decode.is_std)
      {
        /* stores get added to the STQ at commit */
        if(uop->decode.is_sta)
        {  
          if(!core->exec->exec_fused_ST(uop))
          {
             stall_reason = CSTALL_STQ;
             break;
          }
        }

        zesto_assert(uop->timing.when_completed <= core->sim_cycle, (void)0);

        /* uops carrying an immediate occupy 3 entries in the uop flow */
        Mop->commit.complete_index += uop->decode.has_imm ? 3 : 1;
        if(Mop->commit.complete_index >= Mop->decode.flow_length)
        {
          Mop->commit.complete_index = -1; /* Mark this Mop as all done */
#ifdef ZTRACE
          ztrace_print(Mop,"c|complete|all uops completed execution");
#endif
          /* whole Mop done executing: train the branch predictor and
             return its speculative update state to the cache */
          if(Mop->fetch.bpred_update)
          {
            core->fetch->bpred->update(Mop->fetch.bpred_update, Mop->decode.opflags,
                Mop->fetch.PC, Mop->fetch.ftPC, Mop->decode.targetPC, Mop->oracle.NextPC, Mop->oracle.taken_branch);
            core->fetch->bpred->return_state_cache(Mop->fetch.bpred_update);
            Mop->fetch.bpred_update = NULL;
          }
          break;
        }

        uop = &Mop->uop[Mop->commit.complete_index];
      }
    }

    if(stall_reason != CSTALL_NONE) break;

    if(Mop->commit.complete_index == -1) /* commit the uops if the Mop is done */
    {
      struct uop_t * uop = ROB[ROB_head];
      zesto_assert(uop->timing.when_completed <= core->sim_cycle,(void)0); 
      zesto_assert(uop->alloc.ROB_index == ROB_head,(void)0);
      zesto_assert(uop == &Mop->uop[Mop->commit.commit_index],(void)0);

      /* record when commit began for this Mop (first uop only) */
      if(uop->decode.BOM && (uop->Mop->timing.when_commit_started == TICK_T_MAX))
        uop->Mop->timing.when_commit_started = core->sim_cycle;


      //SK - load deallocation moved to end of payload pipe
      if(uop->decode.is_sta)
        core->exec->STQ_deallocate_sta();
      
      if(uop->decode.is_std) /* we alloc on STA, dealloc on STD */
      {
        if(!core->exec->STQ_deallocate_std(uop))
        {
          stall_reason = CSTALL_STQ;
          break;
        }
      }

      /* any remaining transactions in-flight (only for loads)
         should now be ignored - such load requests may exist, for
         example as a result of a load that completes early due to
         a hit in the STQ while the cache request is still making
         its way through the memory hierarchy. */
      if(uop->decode.is_load)
        uop->exec.action_id = core->new_action_id();

#ifdef ZTRACE
      ztrace_print(uop,"c|commit|uop committed");
#endif

      /* last uop of the Mop: stamp commit-finished time */
      if(uop->decode.EOM)
        uop->Mop->timing.when_commit_finished = core->sim_cycle;

      /* remove uop from ROB */
      if((!uop->decode.in_fusion) || (uop->decode.fusion_next == NULL)) /* fusion dealloc's on fusion-tail */
      {
        ROB[ROB_head] = NULL;
        ROB_num --;
        ROB_eff_num --;
        ROB_head = modinc(ROB_head,knobs->commit.ROB_size); //(ROB_head+1) % knobs->commit.ROB_size;
        if(uop->decode.in_fusion)
        {
          ZESTO_STAT(core->stat.commit_fusions++;)
        }
      }
      else /* fusion body doesn't count toward commit width */
      {
Пример #4
0
/* In-order instruction commit.  Individual uops cannot commit
   until it is guaranteed that the entire Mop's worth of uops will
   commit. */
void core_commit_STM_t::step(void)
{
  struct core_knobs_t * knobs = core->knobs;
  int commit_count = 0;   /* uops committed this cycle; bounded by commit.width */
  enum commit_stall_t stall_reason = CSTALL_NONE;

  /* This is just a deadlock watchdog. If something got messed up
     in the pipeline and no forward progress is being made, this
     code will eventually detect it. A global watchdog will check
     if any core is making progress and accordingly if not.*/
  if(core->active && ((core->sim_cycle - core->exec->last_completed) > deadlock_threshold))
  {
    deadlocked = true; 
#ifdef ZTRACE
    ztrace_print(core->id, "Possible deadlock detected.");
#endif
    return;
  }

  /* MAIN COMMIT LOOP */
  for(commit_count=0;commit_count<knobs->commit.width;commit_count++)
  {
    if(ROB_num <= 0) /* nothing to commit */
    {
      /* only report an empty-ROB stall if nothing committed this cycle */
      stall_reason = commit_count?CSTALL_NONE:CSTALL_EMPTY;
      break;
    }

    /* oldest in-flight Mop: parent of the uop at the head of the ROB */
    struct Mop_t * Mop = ROB[ROB_head]->Mop;

    if(Mop->oracle.spec_mode)
      fatal("oldest instruction in processor is on wrong-path");

    /* Are all uops in the Mop completed? */
    if(Mop->commit.complete_index != -1) /* still some outstanding insts */
    {
      /* advance complete_index past every uop that has finished execution */
      while(Mop->uop[Mop->commit.complete_index].timing.when_completed <= core->sim_cycle)
      {
        struct uop_t * uop = &Mop->uop[Mop->commit.complete_index];

        /* uops carrying an immediate occupy 3 entries in the uop flow */
        Mop->commit.complete_index += uop->decode.has_imm ? 3 : 1;
        if(Mop->commit.complete_index >= (int) Mop->decode.flow_length)
        {
          Mop->commit.complete_index = -1; /* Mark this Mop as all done */
          /* whole Mop done executing: train the branch predictor and
             return its speculative update state to the cache */
          if(Mop->fetch.bpred_update)
          {
            core->fetch->bpred->update(Mop->fetch.bpred_update, Mop->decode.opflags,
                Mop->fetch.PC, Mop->fetch.ftPC, Mop->decode.targetPC, Mop->oracle.NextPC, Mop->oracle.taken_branch);
            core->fetch->bpred->return_state_cache(Mop->fetch.bpred_update);
            Mop->fetch.bpred_update = NULL;
          }
          break;
        }
      }
    }

    if(Mop->commit.complete_index == -1) /* commit the uops if the Mop is done */
    {
      struct uop_t * uop = ROB[ROB_head];
      zesto_assert(uop->timing.when_completed <= core->sim_cycle,(void)0);
      zesto_assert(uop->alloc.ROB_index == ROB_head,(void)0);
      zesto_assert(uop == &Mop->uop[Mop->commit.commit_index],(void)0);

      /* record when commit began for this Mop (first uop only) */
      if(uop->decode.BOM && (uop->Mop->timing.when_commit_started == TICK_T_MAX))
        uop->Mop->timing.when_commit_started = core->sim_cycle;

      /* release load/store queue resources held by this uop */
      if(uop->decode.is_load)
        core->exec->LDQ_deallocate(uop);
      else if(uop->decode.is_sta)
        core->exec->STQ_deallocate_sta();
      else if(uop->decode.is_std) /* we alloc on STA, dealloc on STD */
      {
        if(!core->exec->STQ_deallocate_std(uop))
          break;
      }

      /* any remaining transactions in-flight (only for loads)
         should now be ignored - such load requests may exist, for
         example as a result of a load that completes early due to
         a hit in the STQ while the cache request is still making
         its way through the memory hierarchy. */
      if(uop->decode.is_load)
        uop->exec.action_id = core->new_action_id();

      /* last uop of the Mop: stamp commit-finished time */
      if(uop->decode.EOM)
        uop->Mop->timing.when_commit_finished = core->sim_cycle;

      /* remove uop from ROB */
      ROB[ROB_head] = NULL;
      ROB_num --;
      ROB_head = modinc(ROB_head,knobs->commit.ROB_size); //(ROB_head+1) % knobs->commit.ROB_size;
      uop->alloc.ROB_index = -1;

      /* this cleans up idep/odep ptrs, register mappings, and
         commit stores to the real (non-spec) memory system */
      core->oracle->commit_uop(uop);

      /* mark uop as committed in Mop */
      Mop->commit.commit_index += uop->decode.has_imm ? 3 : 1;

      if(Mop->commit.commit_index >= (int) Mop->decode.flow_length)
      {
        Mop->commit.commit_index = -1; /* The entire Mop has been committed */

        /* Update stats */
        if(Mop->uop[Mop->decode.last_uop_index].decode.EOM)
        {
          ZESTO_STAT(core->stat.commit_insn++;)
        }

        ZESTO_STAT(core->stat.commit_uops += Mop->stat.num_uops;)
        ZESTO_STAT(core->stat.commit_refs += Mop->stat.num_refs;)
Пример #5
0
/* Allocation/dispatch stage: move uops from the last stage of the alloc
   pipe into the ROB, RS, and LDQ/STQ, binding each to an execution port. */
void core_alloc_DPM_t::step(void)
{
  struct core_knobs_t * knobs = core->knobs;
  int stage, i;
  enum alloc_stall_t stall_reason = ASTALL_NONE;

  /*========================================================================*/
  /*== Dispatch insts if ROB, RS, and LQ/SQ entries available (as needed) ==*/
  stage = knobs->alloc.depth-1;
  if(occupancy[stage]) /* are there uops in the last stage of the alloc pipe? */
  {
    for(i=0; i < knobs->alloc.width; i++) /* if so, scan all slots (width) of this stage */
    {
      struct uop_t * uop = pipe[stage][i]; /* may be NULL (empty slot) */
      int abort_alloc = false;

      /* if using drain flush: */
      /* is the back-end still draining from a misprediction recovery? */
      if(knobs->alloc.drain_flush && drain_in_progress)
      {
        if(!core->commit->ROB_empty())
        {
          stall_reason = ASTALL_DRAIN;
          break;
        }
        else
          drain_in_progress = false;
      }

      if(uop)
      {
        while(uop) /* this while loop is to handle multiple uops fused together into the same slot */
        {
          /* when_allocated == TICK_T_MAX means this uop has not been
             allocated yet (it is stamped below on success) */
          if(uop->timing.when_allocated == TICK_T_MAX)
          {
            /* is the ROB full? */
            if((!uop->decode.in_fusion||uop->decode.fusion_head) && !core->commit->ROB_available())
            {
              stall_reason = ASTALL_ROB;
              abort_alloc = true;
              break;
            }

            /* for loads, is the LDQ full? */
            if((uop->decode.is_load || uop->decode.is_lfence) && !core->exec->LDQ_available())
            {
              stall_reason = ASTALL_LDQ;
              abort_alloc = true;
              break;
            }
            /* for stores, allocate STQ entry on STA.  NOTE: This is different from
               Bob Colwell's description in Shen&Lipasti Chap 7 where he describes
               allocation on STD.  We emit STA uops first since the oracle needs to
               use the STA result to feed the following STD uop. */
            if((uop->decode.is_sta || uop->decode.is_sfence) && !core->exec->STQ_available())
            {
              stall_reason = ASTALL_STQ;
              abort_alloc = true;
              break;
            }

            /* is the RS full? -- don't need to alloc for NOPs,fences, signals */
            if(!core->exec->RS_available() && !uop->decode.is_nop &&
                !uop->decode.is_lfence && !uop->decode.is_sfence &&
                !is_uop_helix_signal(uop))
            {
              stall_reason = ASTALL_RS;
              abort_alloc = true;
              break;
            }

            /* ALL ALLOC STALL CONDITIONS PASSED */

            /* place in ROB */
            if((!uop->decode.in_fusion) || uop->decode.is_fusion_head)
              core->commit->ROB_insert(uop);
            else /* fusion body doesn't occupy additional ROB entries */
              core->commit->ROB_fuse_insert(uop);

            /* place in LDQ/STQ if needed */
            if(uop->decode.is_load || uop->decode.is_lfence)
              core->exec->LDQ_insert(uop);
            else if(uop->decode.is_sta || uop->decode.is_sfence)
              core->exec->STQ_insert_sta(uop);
            else if(uop->decode.is_std)
              core->exec->STQ_insert_std(uop);

            /* port bindings */
            if(!uop->decode.is_nop && !uop->Mop->decode.is_trap &&
                !uop->decode.is_lfence && !uop->decode.is_sfence &&
                !is_uop_helix_signal(uop))
            {
              /* port-binding is trivial when there's only one valid port */
              if(knobs->exec.port_binding[uop->decode.FU_class].num_FUs == 1)
              {
                uop->alloc.port_assignment = knobs->exec.port_binding[uop->decode.FU_class].ports[0];
              }
              else /* else assign uop to least loaded port */
              {
                int min_load = INT_MAX;
                int index = -1;
                for(int j=0;j<knobs->exec.port_binding[uop->decode.FU_class].num_FUs;j++)
                {
                  int port = knobs->exec.port_binding[uop->decode.FU_class].ports[j];
                  if(port_loading[port] < min_load)
                  {
                    min_load = port_loading[port];
                    index = port;
                  }
                }
                uop->alloc.port_assignment = index;
              }
              port_loading[uop->alloc.port_assignment]++;

              /* only allocate for non-fused or fusion-head */
              if((!uop->decode.in_fusion) || uop->decode.is_fusion_head)
                core->exec->RS_insert(uop);
              else
                core->exec->RS_fuse_insert(uop);

              /* Get input mappings - this is a proxy for explicit register numbers, which
                 you can always get from idep_uop->alloc.ROB_index */
              for(size_t j=0;j<MAX_IDEPS;j++)
              {
                /* This use of oracle info is valid: at this point the processor would be
                   looking up this information in the RAT, but this saves us having to
                   explicitly store/track the RAT state. */
                uop->exec.idep_uop[j] = uop->oracle.idep_uop[j];

                /* Add self onto parent's output list.  This output list doesn't
                   have a real microarchitectural counter part, but it makes the
                   simulation faster by not having to perform a whole mess of
                   associative searches each time any sort of broadcast is needed.
                   The parent's odep list only points to uops which have dispatched
                   into the OOO core (i.e. has left the alloc pipe). */
                if(uop->exec.idep_uop[j])
                {
                  struct odep_t * odep = core->get_odep_link();
                  odep->next = uop->exec.idep_uop[j]->exec.odep_uop;
                  uop->exec.idep_uop[j]->exec.odep_uop = odep;
                  odep->uop = uop;
                  //odep->aflags = (uop->decode.idep_name[j] == DCREG(MD_REG_AFLAGS));
                  odep->op_num = j;
                }
              }

              /* Update read stats */
              for(size_t j=0;j<MAX_IDEPS;j++)
              {
                if(x86::is_ireg(uop->decode.idep_name[j]))
                  core->stat.regfile_reads++;
                else if(x86::is_freg(uop->decode.idep_name[j]))
                  core->stat.fp_regfile_reads++;
              }

              /* check "scoreboard" for operand readiness (we're not actually
                 explicitly implementing a scoreboard); if value is ready, read
                 it into data-capture window or payload RAM. */
              tick_t when_ready = 0;
              for(size_t j=0;j<MAX_IDEPS;j++) /* for possible input argument */
              {
                if(uop->exec.idep_uop[j]) /* if the parent uop exists (i.e., still in the processor) */
                {
                  uop->timing.when_itag_ready[j] = uop->exec.idep_uop[j]->timing.when_otag_ready;
                  if(uop->exec.idep_uop[j]->exec.ovalue_valid)
                  {
                    uop->timing.when_ival_ready[j] = uop->exec.idep_uop[j]->timing.when_completed;
                    uop->exec.ivalue_valid[j] = true;
                  }
                }
                else /* read from ARF */
                {
                  uop->timing.when_itag_ready[j] = core->sim_cycle;
                  uop->timing.when_ival_ready[j] = core->sim_cycle;
                  uop->exec.ivalue_valid[j] = true; /* applies to invalid (DNA) inputs as well */
                }
                /* the uop is ready when its latest input tag is ready */
                if(when_ready < uop->timing.when_itag_ready[j])
                  when_ready = uop->timing.when_itag_ready[j];
              }
              uop->timing.when_ready = when_ready;
              if(when_ready < TICK_T_MAX) /* add to readyQ if appropriate */
                core->exec->insert_ready_uop(uop);


            }
            else /* is_nop || is_trap || is_lfence || is_sfence */
            {
              /* NOP's don't go through exec pipeline; they go straight to the
                 ROB and are immediately marked as completed (they still take
                 up space in the ROB though). */
              /* Since traps/interrupts aren't really properly modeled in SimpleScalar, we just let
                 it go through without doing anything. */
              /* Similarly fences don't need to go through RS and go straight to
                 the LDQ */
              uop->timing.when_ready = core->sim_cycle;
              uop->timing.when_issued = core->sim_cycle;
              if (!uop->decode.is_lfence)
                uop->timing.when_completed = core->sim_cycle;
              if (uop->decode.is_lfence || uop->decode.is_sfence ||
                  is_uop_helix_signal(uop) || uop->decode.is_nop)
                uop->timing.when_exec = core->sim_cycle;

              if (is_uop_helix_signal(uop) && uop->decode.is_sta)
                core->exec->STQ_set_addr(uop);
              if (is_uop_helix_signal(uop) && uop->decode.is_std)
                core->exec->STQ_set_data(uop);

              zesto_assert(!uop->decode.is_load, (void)0);
            }

            /* stamp the uop as allocated this cycle */
            uop->timing.when_allocated = core->sim_cycle;

#ifdef ZTRACE
            ztrace_print_start(uop,"a|alloc:ROB=%d,",uop->alloc.ROB_index);
            if(uop->alloc.RS_index == -1) // nop
              ztrace_print_cont(core->id, "RS=.");
            else
              ztrace_print_cont(core->id, "RS=%d",uop->alloc.RS_index);
            if(uop->decode.in_fusion && !uop->decode.is_fusion_head)
              ztrace_print_cont(core->id, "f");
            if(uop->alloc.LDQ_index == -1)
              ztrace_print_cont(core->id, ":LDQ=.");
            else
              ztrace_print_cont(core->id, ":LDQ=%d",uop->alloc.LDQ_index);
            if(uop->alloc.STQ_index == -1)
              ztrace_print_cont(core->id, ":STQ=.");
            else
              ztrace_print_cont(core->id, ":STQ=%d",uop->alloc.STQ_index);
            ztrace_print_cont(core->id, ":pb=%d",uop->alloc.port_assignment);
            ztrace_print_finish(core->id, "|uop alloc'd and dispatched");
#endif

          }

          /* advance along the fusion chain, if any */
          if(uop->decode.in_fusion)
            uop = uop->decode.fusion_next;
          else
            uop = NULL;
        }

        if(abort_alloc)
          break;

        if((!pipe[stage][i]->decode.in_fusion) || !uop) /* either not fused, or complete fused uops alloc'd */
        {
          uop = pipe[stage][i]; /* may be NULL if we just finished a fused set */

          /* update stats */
          if(uop->decode.EOM)
            ZESTO_STAT(core->stat.alloc_insn++;)

          ZESTO_STAT(core->stat.alloc_uops++;)
          if(uop->decode.in_fusion)
            ZESTO_STAT(core->stat.alloc_eff_uops += uop->decode.fusion_size;)
          else
            ZESTO_STAT(core->stat.alloc_eff_uops++;)