void FE_stage()
{
  /* only part of FE_stage function is implemented */
  /* please complete the rest of FE_stage function */
  if (stop_fetching)
    return;

  if (FE_latch->op_valid || FE_latch->pipeline_stall_enabled) {
    /* Data inside the latch is valid and the next stage is still using it,
       or the ID stage has enabled pipeline stalling because of a branch
       instruction. Do not fetch. */
    return;
  }

  Op *op = get_free_op();
  bool next_op_exists = get_op(op);
  if (!next_op_exists) {
    /* Indicate the end of the trace to the later stages */
    free_op(op);
    op = get_free_op();
    op->is_last_instruction = true;
    stop_fetching = true;
  }

  /* hwsim : pass the fetched instruction to the ID stage */
  FE_latch->op = op;                          /* pass the op to ID stage */
  FE_latch->op_valid = true;                  /* mark it as valid */
  FE_latch->pipeline_stall_enabled = false;   /* disable pipeline stall, if any */

  // next_pc = pc + op->inst_size;  // you need this code for building a branch predictor
}
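/* For reference, a minimal sketch of the FE->ID latch this stage writes to. Only the
   fields referenced by FE_stage above are shown; the surrounding declaration (and the
   forward declaration of Op) is an assumption, since the real latch type is defined
   elsewhere in the simulator. */
struct Op;   /* the simulator's op type, declared elsewhere */

struct FE_ID_latch_sketch {
  Op  *op;                       /* op handed from FE to ID */
  bool op_valid;                 /* true while ID has not yet consumed the op */
  bool pipeline_stall_enabled;   /* set by ID (e.g. on a branch) to pause fetch */
};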
Op *copy_op(Op *copyop)
{
  /* Get a free op from the pool, copy the fields of 'copyop' into it, and return the copy */
  Op *op = get_free_op();
  op->num_src          = copyop->num_src;
  op->src[0]           = copyop->src[0];
  op->src[1]           = copyop->src[1];
  op->dst              = copyop->dst;
  op->opcode           = copyop->opcode;
  op->is_fp            = copyop->is_fp;
  op->cf_type          = copyop->cf_type;
  op->mem_type         = copyop->mem_type;
  op->write_flag       = copyop->write_flag;
  op->inst_size        = copyop->inst_size;
  op->ld_vaddr         = copyop->ld_vaddr;
  op->st_vaddr         = copyop->st_vaddr;
  op->instruction_addr = copyop->instruction_addr;
  op->branch_target    = copyop->branch_target;
  op->actually_taken   = copyop->actually_taken;
  op->mem_read_size    = copyop->mem_read_size;
  op->mem_write_size   = copyop->mem_write_size;
  op->valid            = copyop->valid;
  /* you might add more features here */
  op->is_last_instruction = copyop->is_last_instruction;
  op->wait_till_cycle     = copyop->wait_till_cycle;
  op->is_waiting          = copyop->is_waiting;
  return op;   /* return the copied op */
}
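/* A hedged usage sketch for copy_op(): one place such a copy helper is handy is when a
   single trace instruction has to be split into two pipeline ops (for example, a memory
   micro-op plus an execute micro-op). The helper below is illustrative only - the split
   rule and the choice of fields to adjust are assumptions, not the simulator's actual
   decode logic; it assumes the Op and op-pool declarations above are in scope. */
void split_into_two_uops_sketch(Op *orig, Op **mem_uop, Op **ex_uop)
{
  *mem_uop = copy_op(orig);        /* first uop keeps the memory access fields */
  *ex_uop  = copy_op(orig);        /* second uop performs only the computation ... */
  (*ex_uop)->mem_type = NOT_MEM;   /* ... so it is marked as a non-memory op
                                      (NOT_MEM is the value MEM_stage tests against) */
  free_op(orig);                   /* the original trace op is no longer needed */
}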
void FE_stage()
{
  /* only part of FE_stage function is implemented */
  /* please complete the rest of FE_stage function */
  if (FE_latch->stage_stall == false) {          // fetch only when the pipeline is not stalled
    Op *new_op = get_free_op();                  // get a placeholder op out of the pool
    if (get_op(new_op)) {                        // copy the next op from the trace into the placeholder
      /* instruction-mix counters */
      if ((new_op->opcode >= OP_FMEM) && (new_op->opcode <= OP_FCMO))
        float_count++;
      if ((new_op->opcode >= OP_IADD) && (new_op->opcode <= OP_MM))
        integer_count++;
      if ((new_op->opcode >= OP_CF) && (new_op->opcode <= OP_LDA)) {
        if (new_op->opcode == OP_CF)
          branch_count++;
        integer_count++;
      }
      if ((new_op->opcode >= OP_LDA) && (new_op->opcode <= OP_LD))
        load_count++;
      if ((new_op->opcode == OP_MM) || (new_op->opcode == OP_IMUL))
        multiple_count++;
      if (new_op->opcode == OP_ST)
        store_count++;

      /* conditional branch: consult the branch predictor */
      if ((new_op->opcode == OP_CF) && (new_op->cf_type == CF_CBR) && (use_bpred == true)) {
        uint64_t pc = new_op->instruction_addr;
        bool predicted_dir = bpred_access(branchpred, pc);
        bpred_update(branchpred, pc, predicted_dir, new_op->actually_taken);
        if (new_op->actually_taken != predicted_dir)
          new_op->mispredicted_branch = true;
        bpred_mispred_count = branchpred->mispred;
        bpred_okpred_count  = branchpred->okpred;
      }

      FE_latch->op = new_op;
      FE_latch->op_valid = true;
    }
    else {
      free_op(new_op);   // no op left in the trace; return the placeholder to the pool
    }
  }
  // next_pc = pc + op->inst_size;  // you need this code for building a branch predictor
}
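/* The bpred_access()/bpred_update() calls above are implemented elsewhere (bpred.cpp in
   the lab skeleton). As a rough, hedged illustration of a predictor that would be
   compatible with those calls, here is a minimal gshare-style sketch. Everything in it
   is an assumption about the real implementation except the okpred/mispred counters,
   which the FE code above reads from the predictor object. */
#include <cstdint>

struct bpred_sketch {
  uint64_t ghr;          /* global history register */
  uint8_t  pht[4096];    /* 2-bit saturating counters (4096-entry table is an assumption) */
  uint64_t okpred;       /* correct-prediction counter, read by FE_stage above */
  uint64_t mispred;      /* misprediction counter, read by FE_stage above */
};

static bool bpred_access_sketch(bpred_sketch *b, uint64_t pc)
{
  uint64_t idx = (pc ^ b->ghr) & 0xFFF;          /* XOR the PC with the history to index the PHT */
  return b->pht[idx] >= 2;                       /* predict taken if the counter is 2 or 3 */
}

static void bpred_update_sketch(bpred_sketch *b, uint64_t pc, bool predicted, bool taken)
{
  uint64_t idx = (pc ^ b->ghr) & 0xFFF;
  if (taken  && b->pht[idx] < 3) b->pht[idx]++;  /* strengthen toward taken */
  if (!taken && b->pht[idx] > 0) b->pht[idx]--;  /* weaken toward not-taken */
  b->ghr = (b->ghr << 1) | (taken ? 1 : 0);      /* shift the real outcome into the GHR */
  if (predicted == taken) b->okpred++;
  else                    b->mispred++;
}
/* Usage mirrors the calls above: zero-initialize a bpred_sketch, call *_access_sketch
   for the prediction, then *_update_sketch with the prediction and the real outcome. */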
void FE_stage()
{
  /* The FE latch valid flag has to be FALSE only when there is a cd_stall AND no other
     stall. Only during a pure cd_stall does the ID stage keep processing instructions
     and moving them to the next latch; when other stalls (ex_stall, dd_stall, etc.)
     occur along with cd_stall, the ID stage is stalled as well. */
  fe_ctr++;
  if (fe_ctr > 1000) {   // if the simulator waits for an op for > 1000 cycles, terminate the program
    //std::cout<<"****************Detected a loop. Terminating the simulator***************"<<endl;
    exit(1);
  }

  // std::cout<<"stall flags are: mem_latency_stall="<<mem_latency_stall<<" mshr stall="<<mshr_stall<<" cd stall="<<cd_stall<<" dd stall="<<dd_stall<<" ex stall="<<ex_stall<<" fe stall="<<fe_stall<<endl;

  if (mem_latency_stall == FALSE && mshr_stall == FALSE && cd_stall == FALSE &&
      dd_stall == FALSE && ex_stall == FALSE && fe_stall == FALSE) {
    if (FE_latch->op_valid == FALSE) {   // don't overwrite the instruction in the FE latch while ID has not consumed it
      Op *fe_op = get_free_op();         // op currently in the FE stage
      bool success = get_op(fe_op);
      if (success == FALSE) {            // last instruction
        fe_op->last_op = TRUE;
        fe_stall = TRUE;
      }
      FE_latch->op = fe_op;
      FE_latch->op_valid = TRUE;
      fe_ctr = 0;                        // an instruction was successfully read, so reset the loop-detection counter
      return;
    }
  }
  // next_pc = pc + op->inst_size;  // you need this code for building a branch predictor
}
/* send the end-of-stream instruction into the pipeline */
bool sendEOS()
{
  int i;
  for (i = 0; i < thread_count; i++) {
    if (!FE_latch->op_valid_thread[i] && !FE_latch->pipeline_stall_enabled_thread[i])
      break;
  }
  if (i == thread_count) {
    /* all slots are full */
    return false;
  }

  /* a last_instruction marker can be placed in stream 'i' */
  Op *op = get_free_op();
  op->is_last_instruction = true;
  op->thread_id = i;
  FE_latch->op_queue[i] = op;
  FE_latch->op_valid_thread[i] = true;
  return true;
}
void MEM_stage(memory_c *main_memory)   // please modify MEM_stage function argument /** NEW-LAB2 */
{
  if ((MEM_latch->op_valid) || (!EX_latch->op_valid))
    return;

  Op *op = EX_latch->op;
  int threadid = op->thread_id;   /* keep this 0 for LAB 3 */
  uint64_t effective_latency = dcache_latency - 1;

  if (!((op->mem_type > NOT_MEM) && (op->mem_type < NUM_MEM_TYPES))) {   // not a memory op
    MEM_latch->op = op;   // deprecated
    MEM_latch->oplist.push_back(op);
    MEM_latch->op_valid = true;
    EX_latch->op_valid = false;
    return;
  }

  UINT64 ac_addr = (op->mem_type == MEM_ST) ? op->st_vaddr : op->ld_vaddr;

  if (pteaddr_broadcasted) {
    /* We have the translation now. Set up the op to do a cache access so that
       the actual data is loaded. */
    uint64_t vpn   = ac_addr / KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    uint64_t index = ac_addr % KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    uint64_t pteaddr = vmem_get_pteaddr(vpn, threadid);
    uint64_t pfn     = vmem_vpn_to_pfn(vpn, threadid);
    ac_addr = (pfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | index;
    if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
    else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
    EX_latch->pipeline_stall_enabled = false;
    pteaddr_broadcasted = false;
    goto dcache_access_for_data;
  }

  if (vmem_enabled && (op->mem_type > NOT_MEM) && (op->mem_type < NUM_MEM_TYPES)) {
    /* Probe the TLB only to check whether we would hit. This is not the real
       TLB access (that happens later); it only determines the dcache latency. */
    uint64_t tvaddr = (op->mem_type == MEM_ST) ? op->st_vaddr : op->ld_vaddr;
    uint64_t tvpn = ac_addr / KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    uint64_t tpfn;
    if (!tlb_access(dtlb, tvpn, threadid, &tpfn))
      effective_latency = 2 * (dcache_latency - 1);
  }

  /* If it is a memory instruction, wait for the dcache latency cycles */
  if ((op->mem_type > NOT_MEM) && (op->mem_type < NUM_MEM_TYPES)) {
    if (op->is_waiting) {
      if (cycle_count < op->wait_till_cycle)   // op should remain blocked for the dcache access latency
        return;
      op->is_waiting = false;                  // op completed its wait for the dcache access latency
    }
    else {
      if (!returning_on_mshr_full && !pteop_returning_on_mshr_full) {
        op->wait_till_cycle = cycle_count + effective_latency;   // new op - set a deadline
        op->is_waiting = true;                                   // order it to wait
        return;
      }
    }
  }

  uint64_t tvpn, tpfn, tpteaddr, tindex, tphysical_addr;
  if (pteop_returning_on_mshr_full) {
    tvpn     = ac_addr / KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    tindex   = ac_addr % KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    tpteaddr = vmem_get_pteaddr(tvpn, threadid);
    bool b = tlb_access(dtlb, tvpn, threadid, &tpfn);
    if (b) { dtlb_hit_count++;  dtlb_hit_count_thread[threadid]++;  }
    else   { dtlb_miss_count++; dtlb_miss_count_thread[threadid]++; }
    if (b) {
      /* Got the address translation from the TLB */
      tphysical_addr = (tpfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | tindex;
      ac_addr = tphysical_addr;
      if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
      else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
      /* Remove the flag that indicates that insert_mshr failed */
      pteop_returning_on_mshr_full = false;
      /* Release the stall, as it no longer applies */
      EX_latch->pipeline_stall_enabled = false;
      goto dcache_access_for_data;
    }
    else if (dcache_access(tpteaddr)) {
      /* We got a cache hit on the address translation */
      dcache_hit_count++;
      dcache_hit_count_thread[threadid]++;
      cache_update(data_cache, tpteaddr);
      /* We got the pfn from the dcache; here it is obtained through the
         vpn_to_pfn translation function. */
      tpfn = vmem_vpn_to_pfn(tvpn, threadid);
      tphysical_addr = (tpfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | tindex;
      tlb_install(dtlb, tvpn, threadid, tpfn);
      /* Change the address accessed in the cache as well as the memory request
         address in case there is a cache miss */
      ac_addr = tphysical_addr;
      if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
      else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
      /* Remove the flag that indicates that insert_mshr failed */
      pteop_returning_on_mshr_full = false;
      /* Release the stall, as it no longer applies */
      EX_latch->pipeline_stall_enabled = false;
      goto dcache_access_for_data;   // add if needed
    }
    else {
      /* We got a cache miss for the address translation. We will need to look
         up the Page Table Entry in DRAM. */
      dcache_miss_count++;
      dcache_miss_count_thread[threadid]++;
      /* Stall the pipeline, since we want to make a DRAM access for the PTE */
      EX_latch->pipeline_stall_enabled = true;
      /* We also need a dummy load op that will go into memory */
      Op *pteop = get_free_op();
      pteop->is_pteop = true;
      pteop->mem_type = MEM_LD;
      pteop->ld_vaddr = tpteaddr;
      pteop->mem_read_size = VMEM_PTE_SIZE;
      pteop->vpn = tvpn;
      if (main_memory->store_load_forwarding(pteop)) {
        tpfn = vmem_vpn_to_pfn(tvpn, threadid);
        tphysical_addr = (tpfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | tindex;
        tlb_install(dtlb, tvpn, threadid, tpfn);
        cache_update(data_cache, tpteaddr);
        ac_addr = tphysical_addr;
        if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
        else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
        /* Remove the flag that indicates that insert_mshr failed */
        pteop_returning_on_mshr_full = false;
        /* Release the stall, as it no longer applies */
        EX_latch->pipeline_stall_enabled = false;
        goto dcache_access_for_data;   // add if needed
      }
      else if (main_memory->check_piggyback(pteop)) {
        pteop_returning_on_mshr_full = false;
        EX_latch->pipeline_stall_enabled = true;
        return;
      }
      else if (main_memory->insert_mshr(pteop)) {
        pteop_returning_on_mshr_full = false;
        EX_latch->pipeline_stall_enabled = true;
        return;
      }
      else {
        dtlb_miss_count--;
        dtlb_miss_count_thread[threadid]--;
        pteop_returning_on_mshr_full = true;
        EX_latch->pipeline_stall_enabled = true;
        free_op(pteop);
        return;
      }
    }
  }

  /* If we came back here because the MSHR was full during the first attempt to
     get the translation, and the translation is still not available, then the
     rest of the function must not execute. */
  if (EX_latch->pipeline_stall_enabled)
    return;

  /* The op has completed its wait for the dcache latency */
  if (returning_on_mshr_full) {
    UINT64 ac_addr = (op->mem_type == MEM_ST) ? op->st_vaddr : op->ld_vaddr;
    if (dcache_access(ac_addr)) {
      /* We got a cache hit - pass the op to the WB stage */
      dcache_hit_count++;
      dcache_hit_count_thread[threadid]++;
      cache_update(data_cache, ac_addr);
      MEM_latch->op = op;   // deprecated
      MEM_latch->oplist.push_back(op);
      MEM_latch->op_valid = true;
      EX_latch->op_valid = false;        /* helps in handling Case #2: hit under miss */
      returning_on_mshr_full = false;    // XXX : check validity - added in lab 3
      return;
    }
    if (main_memory->insert_mshr(op)) {
      /* added successfully into the MSHR */
      EX_latch->op_valid = false;
      returning_on_mshr_full = false;
      return;
    }
    else {
      returning_on_mshr_full = true;
      return;   // MSHR is full - wait for the next cycle
    }
  }

  ac_addr = (op->mem_type == MEM_ST) ? op->st_vaddr : op->ld_vaddr;
  UINT64 physical_addr;
  uint64_t vpn, pfn, pteaddr, index;

  /* If virtual memory is in use, access the TLB. This is the real TLB access,
     where we obtain the translation and decide whether to access the dcache.
     The cache is now indexed using physical addresses. */
  if (vmem_enabled && (op->mem_type > NOT_MEM) && (op->mem_type < NUM_MEM_TYPES) &&
      !pteop_returning_on_mshr_full) {
    vpn   = ac_addr / KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    index = ac_addr % KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
    bool b = tlb_access(dtlb, vpn, threadid, &pfn);
    if (b) { dtlb_hit_count++;  dtlb_hit_count_thread[threadid]++;  }
    else   { dtlb_miss_count++; dtlb_miss_count_thread[threadid]++; }
    if (b) {
      /* Got the address translation from the TLB */
      physical_addr = (pfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | index;
      /* Change the address accessed in the cache as well as the memory request
         address in case there is a cache miss */
      ac_addr = physical_addr;
      if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
      else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
      EX_latch->pipeline_stall_enabled = false;
      /* Nothing else to do here; fall through to the dcache access for the
         actual data. GOTO dcache_access_for_data; // add if needed */
    }
    else {
      /* TLB miss. We must access the cache / page table in memory to get the
         address translation. Get the PTE address first. */
      vpn     = ac_addr / KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
      index   = ac_addr % KNOB(KNOB_VMEM_PAGE_SIZE)->getValue();
      pteaddr = vmem_get_pteaddr(vpn, threadid);
      if (dcache_access(pteaddr)) {
        /* We got a cache hit on the address translation */
        dcache_hit_count++;
        dcache_hit_count_thread[threadid]++;
        cache_update(data_cache, pteaddr);
        /* We got the pfn from the dcache; here it is obtained through the
           vpn_to_pfn translation function. */
        pfn = vmem_vpn_to_pfn(vpn, threadid);
        physical_addr = (pfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | index;
        tlb_install(dtlb, vpn, threadid, pfn);
        /* Change the address accessed in the cache as well as the memory
           request address in case there is a cache miss */
        ac_addr = physical_addr;
        if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
        else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
        EX_latch->pipeline_stall_enabled = false;
        /* Nothing else to do here; fall through to the dcache access for the
           actual data. GOTO dcache_access_for_data; // add if needed */
      }
      else {
        /* We got a cache miss for the address translation. We will need to
           look up the Page Table Entry in DRAM. */
        dcache_miss_count++;
        dcache_miss_count_thread[threadid]++;
        /* We also need a dummy load op that will go into memory */
        Op *pteop = get_free_op();
        pteop->is_pteop = true;
        pteop->mem_type = MEM_LD;
        pteop->ld_vaddr = pteaddr;
        pteop->mem_read_size = VMEM_PTE_SIZE;
        pteop->vpn = vpn;
        if (main_memory->store_load_forwarding(pteop)) {
          /* We got an MSHR hit through store-load forwarding */
          pfn = vmem_vpn_to_pfn(vpn, threadid);
          physical_addr = (pfn * KNOB(KNOB_VMEM_PAGE_SIZE)->getValue()) | index;
          cache_update(data_cache, pteaddr);
          tlb_install(dtlb, vpn, threadid, pfn);
          ac_addr = physical_addr;
          if (op->mem_type == MEM_ST)      op->st_vaddr = ac_addr;
          else if (op->mem_type == MEM_LD) op->ld_vaddr = ac_addr;
          EX_latch->pipeline_stall_enabled = false;
          /* Nothing else to do here; fall through to the dcache access for the
             actual data. GOTO dcache_access_for_data; // add if needed */
        }
        else if (main_memory->check_piggyback(pteop)) {
          /* Stall the pipeline, since we want to make a DRAM access for the PTE */
          EX_latch->pipeline_stall_enabled = true;
          return;
        }
        else if (main_memory->insert_mshr(pteop)) {
          /* Stall the pipeline, since we want to make a DRAM access for the PTE */
          EX_latch->pipeline_stall_enabled = true;
          return;
        }
        else {
          EX_latch->pipeline_stall_enabled = true;
          pteop_returning_on_mshr_full = true;
          free_op(pteop);
          return;
        }
      }
    }
  }

  /* Check whether we get a hit for the actual data */
dcache_access_for_data:
  if (dcache_access(ac_addr)) {
    /* We got a cache hit - pass the op to the WB stage */
    dcache_hit_count++;
    dcache_hit_count_thread[threadid]++;
    cache_update(data_cache, ac_addr);
    MEM_latch->op = op;   // deprecated
    MEM_latch->oplist.push_back(op);
    MEM_latch->op_valid = true;
    EX_latch->op_valid = false;   /* helps in handling Case #2: hit under miss */
    return;
  }

  /* We got a cache miss */
  dcache_miss_count++;
  dcache_miss_count_thread[threadid]++;

  /* Store-load forwarding */
  if (main_memory->store_load_forwarding(op)) {
    /* We got an MSHR hit through store-load forwarding */
    store_load_forwarding_count++;
    store_load_forwarding_count_thread[threadid]++;
    MEM_latch->op = op;   // deprecated
    MEM_latch->oplist.push_back(op);
    MEM_latch->op_valid = true;
    EX_latch->op_valid = false;
    return;
  }
  /* Check whether there is a block hit for an instruction already present in the MSHR - Case #4: MSHR HIT */
  else if (main_memory->check_piggyback(op)) {
    /* instruction piggybacked - allow EX to send the next instruction */
    EX_latch->op_valid = false;
    return;
  }
  else {
    /* cache & MSHR miss - add into the MSHR */
    if (main_memory->insert_mshr(op)) {
      /* added successfully into the MSHR */
      EX_latch->op_valid = false;
      returning_on_mshr_full = false;
      return;
    }
    else {
      returning_on_mshr_full = true;
      return;   // MSHR is full - wait for the next cycle
    }
  }
  return;
}
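/* The same VPN/offset arithmetic appears several times in MEM_stage above. As a small,
   hedged sketch that collects it in one place (the helper names are illustrative, not
   part of the simulator; the page size is whatever KNOB_VMEM_PAGE_SIZE evaluates to): */
#include <cstdint>

static uint64_t vpn_of_sketch(uint64_t vaddr, uint64_t page_size)
{
  return vaddr / page_size;            /* virtual page number, as passed to vmem_get_pteaddr() above */
}

static uint64_t translate_sketch(uint64_t vaddr, uint64_t pfn, uint64_t page_size)
{
  uint64_t index = vaddr % page_size;  /* byte offset within the page */
  return (pfn * page_size) | index;    /* physical frame base plus offset, as computed above */
}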
void FE_stage()
{
  if (stop_fetching)
    return;

  if (have_to_send_EOS) {
    if (sendEOS()) {
      stop_fetching = true;
      have_to_send_EOS = false;
    }
    return;
  }

#if 0
  if (FE_latch->op_valid || FE_latch->pipeline_stall_enabled) {
    /* Data inside the latch is valid and the next stage is still using it,
       or the ID stage has enabled pipeline stalling because of a branch
       instruction. Do not fetch. */
    return;
  }
  /* This condition is rewritten for multithreading. See the following
     statements.  ~(a OR b) ===> ~a AND ~b */
#endif

  static UINT64 fetch_arbiter = 0;
  int stream_id = -1;
  Op *op;
  bool op_exists = false, stalled[HW_MAX_THREAD];
  for (int i = 0; i < HW_MAX_THREAD; i++)
    stalled[i] = true;

  /* Find the next available empty queue slot to fill (round-robin over threads) */
  for (int i = 0; i < thread_count; i++) {
    stream_id = fetch_arbiter++ % thread_count;
    if (!FE_latch->op_valid_thread[stream_id] &&
        !FE_latch->pipeline_stall_enabled_thread[stream_id]) {
      stalled[stream_id] = false;
      op = get_free_op();
      op_exists = get_op(op, stream_id);
      if (op_exists)
        break;
      else
        free_op(op);
    }
  }

  if (!op_exists) {
    /* No op fetched. This could be because:
       1. all threads were stalled
       2. some threads were stalled and the others have run out of instructions
       3. no instructions are available to fetch */

    // checking case 1
    bool all_stalled = true;
    for (int i = 0; i < thread_count; i++) {
      if (!stalled[i])
        all_stalled = false;
    }
    if (all_stalled)
      return;

    // checking cases 2 & 3
    bool eois = true;   // end of instruction streams
    for (int i = 0; i < thread_count; i++) {
      if (!end_of_stream[i])
        eois = false;
    }
    if (!eois) {
      return;
    }
    else {
      /* Must take action to initiate simulator shutdown. First check whether
         there is space in the queue; if not, try again in the next cycle. */
      if (sendEOS())
        stop_fetching = true;
      else
        have_to_send_EOS = true;
      return;
    }
  }

  /* If the op is a branch other than a conditional branch, assume that it is
     predicted correctly when the branch predictor is used. */
  // if(use_bpred && (op->cf_type >= CF_BR) && (op->cf_type < NUM_CF_TYPES) && (op->cf_type != CF_CBR))
  //   bpred_okpred_count++;
  /* The two lines above are commented out because that is not how the solution is implemented. */

  /* If we are using the branch predictor and the opcode is a conditional branch,
     get a prediction and update the GHR and PHT. */
  if (use_bpred && (op->cf_type == CF_CBR)) {
    int prediction = bpred_access(branchpred, op->instruction_addr, op->thread_id);
    if (prediction == op->actually_taken) {
      bpred_okpred_count++;
      bpred_okpred_count_thread[op->thread_id]++;
    }
    else {
      bpred_mispred_count++;
      bpred_mispred_count_thread[op->thread_id]++;
      /* stall this thread's fetch if we mispredict */
      FE_latch->pipeline_stall_enabled_thread[op->thread_id] = true;
      FE_latch->stall_enforcer_thread[op->thread_id] = op->inst_id;
    }
    bpred_update(branchpred, op->instruction_addr, prediction, op->actually_taken, op->thread_id);
  }

  /* hwsim : pass the fetched instruction to the ID stage */
#if 0
  /* Deprecated after adding MT support */
  FE_latch->op = op;         /* pass the op to ID stage */
  FE_latch->op_valid = true; /* mark it as valid */
#endif
  FE_latch->op_queue[op->thread_id] = op;
  FE_latch->op_valid_thread[op->thread_id] = true;
}
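/* For reference, a sketch of the per-thread FE->ID latch fields that sendEOS() and the
   multithreaded FE_stage() above rely on. Only the referenced fields are shown; the
   enclosing declaration, the HW_MAX_THREAD fallback value, and the field types are
   assumptions - the real declarations live elsewhere in the simulator. */
#ifndef HW_MAX_THREAD
#define HW_MAX_THREAD 4   /* placeholder value; the simulator defines the real one */
#endif

struct FE_latch_mt_sketch {
  Op      *op_queue[HW_MAX_THREAD];                      /* one op slot per hardware thread */
  bool     op_valid_thread[HW_MAX_THREAD];               /* slot holds an unconsumed op */
  bool     pipeline_stall_enabled_thread[HW_MAX_THREAD]; /* set on a mispredicted branch */
  uint64_t stall_enforcer_thread[HW_MAX_THREAD];         /* inst_id that must drain before the stall clears */
};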