void frm_grid_run(struct frm_grid_t *grid) { struct frm_threadblock_t *threadblock, *threadblock_next; struct frm_warp_t *warp, *warp_next; unsigned long long int cycle = 0; /* Set all ready threadblocks to running */ while ((threadblock = grid->pending_list_head)) { frm_threadblock_clear_status(threadblock, frm_threadblock_pending); frm_threadblock_set_status(threadblock, frm_threadblock_running); } /* Set is in state 'running' */ frm_grid_clear_status(grid, frm_grid_pending); frm_grid_set_status(grid, frm_grid_running); /* Execution loop */ while (grid->running_list_head) { /* Stop if maximum number of GPU cycles exceeded */ if (frm_emu_max_cycles && cycle >= frm_emu_max_cycles) esim_finish = esim_finish_frm_max_cycles; /* Stop if maximum number of GPU instructions exceeded */ if (frm_emu_max_inst && frm_emu->inst_count >= frm_emu_max_inst) esim_finish = esim_finish_frm_max_inst; /* Stop if any reason met */ if (esim_finish) break; /* Next cycle */ cycle++; /* Execute an instruction from each work-group */ for (threadblock = grid->running_list_head; threadblock; threadblock = threadblock_next) { /* Save next running work-group */ threadblock_next = threadblock->running_list_next; /* Run an instruction from each warp */ for (warp = threadblock->running_list_head; warp; warp = warp_next) { /* Save next running warp */ warp_next = warp->running_list_next; /* Execute instruction in warp */ frm_warp_execute(warp); } } } /* Dump stats */ frm_grid_dump(grid, stdout); /* Stop if maximum number of functions reached */ //if (frm_emu_max_functions && frm_emu->grid_count >= frm_emu_max_functions) // x86_emu_finish = x86_emu_finish_max_gpu_functions; }
/* Run one emulation iteration of the Fermi emulator.
 *
 * Moves all pending grids (and their thread blocks) to the running
 * lists, executes instructions for every running warp until the running
 * lists drain, then frees all finished grids.
 *
 * Returns FALSE when there is no grid to run (emulation can stop),
 * TRUE to continue emulation. */
int FrmEmuRun(Emu *self)
{
	FrmEmu *emu = asFrmEmu(self);

	struct frm_grid_t *grid;
	struct frm_thread_block_t *thread_block;
	struct frm_warp_t *warp;

	int warp_index;
	int executed;

	/* Stop emulation if no grid needs running */
	if (!list_count(emu->grids))
		return FALSE;

	/* Remove grid and its thread blocks from pending list, and add them
	 * to running list */
	while ((grid = list_head(emu->pending_grids)))
	{
		while ((thread_block = list_head(grid->pending_thread_blocks)))
		{
			list_remove(grid->pending_thread_blocks, thread_block);
			list_add(grid->running_thread_blocks, thread_block);
		}
		list_remove(emu->pending_grids, grid);
		list_add(emu->running_grids, grid);
	}

	/* Execute instructions until all running grids drain.
	 *
	 * BUG FIX: the previous code did
	 *     while ((warp = list_head(...))) {
	 *         if (warp->finished || warp->at_barrier) continue; ... }
	 * which re-reads the *same* list head after 'continue' and spins
	 * forever whenever the head warp is finished or parked at a
	 * barrier. Iterate by index instead so every warp gets a chance to
	 * run each pass. */
	while ((grid = list_head(emu->running_grids)))
	{
		while ((thread_block = list_head(grid->running_thread_blocks)))
		{
			executed = 0;
			for (warp_index = 0;
					warp_index < list_count(thread_block->running_warps);
					warp_index++)
			{
				warp = list_get(thread_block->running_warps, warp_index);

				/* Skip warps that cannot make progress */
				if (warp->finished || warp->at_barrier)
					continue;

				frm_warp_execute(warp);
				executed = 1;
			}

			/* If a full pass executed nothing, every remaining
			 * warp is finished or blocked and no state can ever
			 * change - spinning here would hang the emulator.
			 * Bail out; the assertion below will then fire and
			 * surface the inconsistency loudly instead of a
			 * silent infinite loop. */
			if (!executed)
				goto out;
		}
	}

out:
	/* Free finished grids */
	assert(list_count(emu->pending_grids) == 0 &&
			list_count(emu->running_grids) == 0);
	while ((grid = list_head(emu->finished_grids)))
	{
		/* Dump grid report */
		frm_grid_dump(grid, frm_emu_report_file);

		/* Remove grid from finished list */
		list_remove(emu->finished_grids, grid);

		/* Free grid */
		frm_grid_free(grid);
	}

	/* Continue emulation */
	return TRUE;
}
void frm_sm_fetch(struct frm_sm_t *sm, int wiq_id) { int j; int instructions_processed = 0; int thread_id; struct frm_warp_t *warp; struct frm_thread_t *thread; struct frm_uop_t *uop; struct frm_thread_uop_t *thread_uop; struct frm_warp_inst_queue_entry_t *warp_inst_queue_entry; char inst_str[1024]; char inst_str_trimmed[1024]; warp = sm->warp_inst_queues[wiq_id]->entries[0]->warp; /* No warp */ if (!warp) return; /* Sanity check warp */ assert(warp->warp_inst_queue_entry); /* If instruction is ready the next cycle */ if (warp->warp_inst_queue_entry->ready_next_cycle) { warp->warp_inst_queue_entry->ready = 1; warp->warp_inst_queue_entry->ready_next_cycle = 0; return; } /* Only fetch a fixed number of instructions per cycle */ if (instructions_processed == frm_gpu_fe_fetch_width) return; /* WIQ entry not ready */ if (!warp->warp_inst_queue_entry->ready) return; /* If the warp finishes, there still may be outstanding * memory operations, so if the entry is marked finished * the warp must also be finished, but not vice-versa */ if (warp->warp_inst_queue_entry->warp_finished) { assert(warp->finished); return; } /* Warp is finished but other warps from thread block * remain. There may still be outstanding memory operations, * but no more instructions should be fetched. 
*/ if (warp->finished) return; /* Warp is ready but waiting on outstanding * memory instructions */ if (warp->warp_inst_queue_entry->wait_for_mem) { if (!warp->warp_inst_queue_entry->lgkm_cnt && !warp->warp_inst_queue_entry->vm_cnt) { warp->warp_inst_queue_entry->wait_for_mem = 0; } else { /* TODO Show a waiting state in visualization * tool */ /* XXX uop is already freed */ return; } } /* Warp is ready but waiting at barrier */ if (warp->warp_inst_queue_entry->wait_for_barrier) { /* TODO Show a waiting state in visualization tool */ /* XXX uop is already freed */ return; } /* If fetch buffer full */ if (list_count(sm->fetch_buffers[wiq_id]) == frm_gpu_fe_fetch_buffer_size) return; /* Emulate instruction */ frm_warp_execute(warp); warp_inst_queue_entry = warp->warp_inst_queue_entry; warp_inst_queue_entry->ready = 0; /* Create uop */ uop = frm_uop_create(); uop->warp = warp; uop->thread_block = warp->thread_block; uop->sm = sm; uop->id_in_sm = sm->uop_id_counter++; uop->id_in_warp = warp->uop_id_counter++; uop->warp_inst_queue_id = wiq_id; uop->vector_mem_read = warp->vector_mem_read; uop->vector_mem_write = warp->vector_mem_write; uop->lds_read = warp->lds_read; uop->lds_write = warp->lds_write; uop->warp_inst_queue_entry = warp->warp_inst_queue_entry; uop->warp_last_inst = warp->finished; uop->mem_wait_inst = warp->mem_wait; uop->barrier_wait_inst = warp->barrier; uop->inst = warp->inst; uop->cycle_created = asTiming(frm_gpu)->cycle; assert(warp->thread_block && uop->thread_block); /* Debug */ //frm_inst_dump(inst_str, sizeof inst_str, // warp->grid->inst_buffer, warp->pc / 8); /* Trace */ if (frm_tracing()) { //frm_inst_dump(&warp->inst, warp->inst_size, // warp->pc, // warp->grid->inst_buffer + warp->pc, // inst_str, sizeof inst_str); str_single_spaces(inst_str_trimmed, sizeof inst_str_trimmed, inst_str); frm_trace("si.new_inst id=%lld cu=%d ib=%d wg=%d " "wf=%d uop_id=%lld stg=\"f\" asm=\"%s\"\n", uop->id_in_sm, sm->id, uop->warp_inst_queue_id, 
uop->thread_block->id, warp->id, uop->id_in_warp, inst_str_trimmed); } /* Update last memory accesses */ for (thread_id = uop->warp->threads[0]->id_in_warp; thread_id < uop->warp->thread_count; thread_id++) { thread = uop->warp->threads[thread_id]; thread_uop = &uop->thread_uop[thread->id_in_warp]; /* Global memory */ thread_uop->global_mem_access_addr = thread->global_mem_access_addr; thread_uop->global_mem_access_size = thread->global_mem_access_size; /* LDS */ thread_uop->lds_access_count = thread->lds_access_count; for (j = 0; j < thread->lds_access_count; j++) { thread_uop->lds_access_kind[j] = thread->lds_access_type[j]; thread_uop->lds_access_addr[j] = thread->lds_access_addr[j]; thread_uop->lds_access_size[j] = thread->lds_access_size[j]; } } /* Access instruction cache. Record the time when the * instruction will have been fetched, as per the latency * of the instruction memory. */ uop->fetch_ready = asTiming(frm_gpu)->cycle + frm_gpu_fe_fetch_latency; /* Insert into fetch buffer */ list_enqueue(sm->fetch_buffers[wiq_id], uop); instructions_processed++; sm->inst_count++; }