/* Main execute loop for the token-based NEC V-series core.
 * Consumes the cycle budget in nec_state->icount one instruction at a time. */
static CPU_EXECUTE( necv )
{
	nec_state_t *nec_state = get_safe_token(device);
	int icount_before;

	/* A halted CPU just burns the entire timeslice. */
	if (nec_state->halted)
	{
		nec_state->icount = 0;
		debugger_instruction_hook(device, (Sreg(PS)<<4) + nec_state->ip);
		return;
	}

	while (nec_state->icount > 0)
	{
		/* Dispatch IRQ: NMI is always taken, maskable requests only when IF is set. */
		if (nec_state->pending_irq && nec_state->no_interrupt == 0)
		{
			if ((nec_state->pending_irq & NMI_IRQ) || nec_state->IF)
				external_int(nec_state);
		}

		/* No interrupt allowed between last instruction and this one */
		if (nec_state->no_interrupt)
			nec_state->no_interrupt--;

		debugger_instruction_hook(device, (Sreg(PS)<<4) + nec_state->ip);

		/* Execute one opcode, then model the prefetch queue with the
		   number of cycles the instruction actually consumed. */
		icount_before = nec_state->icount;
		nec_instruction[fetchop(nec_state)](nec_state);
		do_prefetch(nec_state, icount_before);
	}
}
void nec_common_device::execute_run() { int prev_ICount; if (m_halted) { m_icount = 0; debugger_instruction_hook(this, (Sreg(PS)<<4) + m_ip); return; } while(m_icount>0) { /* Dispatch IRQ */ if (m_pending_irq && m_no_interrupt==0) { if (m_pending_irq & NMI_IRQ) external_int(); else if (m_IF) external_int(); } /* No interrupt allowed between last instruction and this one */ if (m_no_interrupt) m_no_interrupt--; debugger_instruction_hook(this, (Sreg(PS)<<4) + m_ip); prev_ICount = m_icount; (this->*s_nec_instruction[fetchop()])(); do_prefetch(prev_ICount); } }
int nec_execute(int cycles) { nec_state_t *nec_state = sChipsPtr; int prev_ICount; nec_state->icount = cycles; nec_state->cycles_remaining = cycles; if (nec_state->halted) { nec_state->icount = 0; //debugger_instruction_hook(device, (Sreg(PS)<<4) + nec_state->ip); return cycles; } while((nec_state->icount>0) && (!nec_state->stop_run)) { /* Dispatch IRQ */ if (nec_state->pending_irq && nec_state->no_interrupt==0) { if (nec_state->pending_irq & NMI_IRQ) external_int(nec_state); else if (nec_state->IF) external_int(nec_state); } /* No interrupt allowed between last instruction and this one */ if (nec_state->no_interrupt) nec_state->no_interrupt--; //debugger_instruction_hook(device, (Sreg(PS)<<4) + nec_state->ip); prev_ICount = nec_state->icount; nec_instruction[fetchop(nec_state)](nec_state); do_prefetch(nec_state, prev_ICount); } nec_state->cycles_total += cycles - nec_state->icount; nec_state->cycles_remaining = 0; nec_state->stop_run = 0; return (cycles - nec_state->icount); }
/*
 * prefetch_irritator - thread body for the HTX prefetch irritator.
 *
 * arg: pointer to this thread's struct thread_context (cpu binding, rule,
 *      RNG seed/buffer, per-thread index).
 *
 * Binds the thread to its assigned logical cpu (htx_bind_thread on Linux,
 * bindprocessor on AIX), then performs current_rule->num_oper passes over
 * the thread's contiguous memory region, exercising whichever prefetch
 * variants the rule enables (n-stride, partial dcbt, irritator, transient
 * dcbt, dcbtna) either round-robin or as a single selected algorithm.
 * On a data miscompare it reports via hxfmsg(), dumps the region, and
 * returns.  Checks the global exit_flag between phases so SIGTERM ends the
 * loop promptly.  Unbinds the cpu before returning.
 */
void prefetch_irritator(void *arg)
{
	int i, rc, no_of_pages, tid , thread_no, tc, oper , number_of_operations;
	unsigned long long saved_seed, random_no , starting_address , memory_fetch_size;
	pthread_t ptid;
	unsigned char *start_addr;
	struct thread_context *th = (struct thread_context *)arg;
	struct ruleinfo *current_rule = th->current_rule;
	int cache_type = current_rule->tgt_cache;              /* which cache level this rule targets */
	int cache_line_size = system_information.cinfo[cache_type].line_size;
	unsigned int loop_count ;
	long int offset;
	unsigned long long temp_storage = 0x1, temp_pattern = 0x1;  /* scratch outputs for dcbtna miscompare reporting */

	/*
	 * char *contig_mem[NUM_SEGS*SEG_SIZE/(16*M)]; Physically contiguous
	 * memory pointer. memory_set_size variable gives total memory
	 * allocated both are variables of global structure.
	 */
	thread_no = th->thread_no ;
	int pcpu = pcpus_thread_wise[thread_no];  /* cached physical cpu from a previous bind, -1 if none */
	tid = th->bind_to_cpu; /* Bind to the processor */
	ptid = th->tid; /* PThread Id for this thread */
	prefetch_streams = th->prefetch_streams; /* Number of prefetch streams for this thread. 
*/
	if (current_rule->testcase_type != PREFETCH_ONLY) {
		/* Set Thread Cancel Type as ASYNCHRONOUS */
		pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
	}
#ifdef __HTX_LINUX__
	/*printf(" Prefetch:calling htx_bind with pcpu=%d for thread_no= %d\n",pcpu,thread_no);*/
	/* First bind discovers the physical cpu; later binds reuse the cached one. */
	if(pcpu == -1){
		pcpu = htx_bind_thread(tid, -1);
		rc = pcpu;
		pcpus_thread_wise[thread_no]=pcpu;
		if(pcpu < 0){
			pcpus_thread_wise[thread_no]= -1;
		}
	}
	else {
		rc = htx_bind_thread(tid,pcpu);
	}
#else
	rc = bindprocessor(BINDTHREAD, thread_self(), tid);
#endif
	DEBUG_LOG("[%d] thread %d, binding to cpu %d \n",__LINE__,thread_no,tid);
	if(rc < 0) {
#ifdef __HTX_LINUX__
		/* rc == -2 means the cpu was hot-removed: retire this thread quietly. */
		if( rc == -2) {
			tot_thread_count --;
			sprintf(msg,"lcpu:%d(pcpu=%d) prefetch has been hot removed, thread will be terminating now tot_thread_count=%d\n",tid,pcpu,tot_thread_count);
			hxfmsg(&h_d, errno, HTX_HE_INFO, msg);
			pthread_exit(NULL);
		}
		else {
			sprintf(msg, "%d: Bindprocessor for prefetch irritator on lcpu:%d and corresponding pcpu:%d failed with rc=%d\n", __LINE__, tid,pcpu,rc);
			hxfmsg(&h_d, errno, HTX_HE_HARD_ERROR, msg);
		}
#else
		sprintf(msg, "Binding to cpu:%d failed with errno: %d \n",tid,errno);
		hxfmsg(&h_d, errno, HTX_HE_SOFT_ERROR, msg);
#endif
	} /* End of if */
	else {
		/*sprintf(msg,"::physical cpu:%d for log cpu:%d\n",pcpu,tid);
		hxfmsg(&h_d, rc , HTX_HE_INFO, msg);*/
#ifdef DEBUG
		sprintf(msg,"[%d] Bindprocessor success [prefetch thread_bo %d]! 
cpu_no : %d , pthread id : 0x%x \n",__LINE__,thread_no,tid,ptid);
		hxfmsg(&h_d, errno, HTX_HE_INFO, msg);
#endif
	}

	/* Seed this thread's reentrant RNG and derive the pass parameters. */
	th->seedval = time(NULL);
	srand48_r(th->seedval,&th->buffer);
	number_of_operations = current_rule->num_oper;
	starting_address = (unsigned long long)(th_array[thread_no].start_of_contiguous_memory);
	memory_fetch_size = current_rule->prefetch_memory_size - BYTES_EXC ;
	loop_count = memory_fetch_size / cache_line_size;   /* cache lines covered per pass */

	for (oper = 0; oper < number_of_operations ; oper++) {
		/* if SIGTERM was received, exit */
		if(exit_flag != 0) {
			break;
		}
		/* 32-bit random replicated into both halves of a 64-bit word. */
		random_no = get_random_number_perf(thread_no);
		random_no = (unsigned long long)(random_no<<32) | (random_no);
		/*random_no = 0xaabbccdd;
		random_no = th_array[thread_no].random_pattern;*/
		th_array[thread_no].prev_seed = random_no;

		/* Now write DSCR if needed */
		/* NOTE(review): POWER8-and-later only, per the pvr gate. */
		if ( system_information.pvr >= POWER8_MURANO ) {
			prefetch_randomise_dscr(random_no, th->current_rule->pf_dscr , thread_no);
		}

		if (th_array[thread_no].prefetch_algorithm == RR_ALL_ENABLED_PREFETCH_ALGORITHMS) {
			/* Run all the enabled prefetch variants in round robin method */
			/* If prefetch nstride is set in the current prefetch configuration */
			if ( (PREFETCH_NSTRIDE & current_rule->pf_conf) == PREFETCH_NSTRIDE ) {
				n_stride(starting_address,memory_fetch_size,random_no,&th_array[thread_no].prefetch_scratch_mem[0]);
			}
			/* if SIGTERM was received, exit */
			if (exit_flag != 0) {
				break;
			}
			/* If prefetch partial is set in the current prefetch configuration */
			if ( (PREFETCH_PARTIAL & current_rule->pf_conf) == PREFETCH_PARTIAL ) {
				partial_dcbt(starting_address,memory_fetch_size,random_no,&th_array[thread_no].prefetch_scratch_mem[0]);
			}
			/* if SIGTERM was received, exit */
			if (exit_flag != 0) {
				break;
			}
			if ( (PREFETCH_IRRITATOR & current_rule->pf_conf) == PREFETCH_IRRITATOR ) {
				/* do_prefetch returns 0 on success, else lines-remaining at the miscompare. */
				rc = do_prefetch( starting_address , memory_fetch_size , random_no, thread_no, loop_count, th_array[thread_no].pattern);
				if ( rc != 0 ) {
					/* NOTE(review): %x specifiers vs unsigned long long args look
					 * mismatched here — confirm against compiler warnings. */
					sprintf(msg,"[%d] Miscompare in Prefetch!! 
Expected data = 0x%x Actual data = 0x%x thread_index : 0x%x Start of memory = %p, memory size = 0x%x\n" ,__LINE__,th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, starting_address, memory_fetch_size);
					hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
					dump_miscompare_data(thread_no, (unsigned char *)starting_address);
					return;
				}
				/*prefetch(starting_address,memory_fetch_size,random_no,&th_array[thread_no].prefetch_scratch_mem[0]);*/
			}
			/* if SIGTERM was received, exit */
			if (exit_flag != 0) {
				break;
			}
			if( (PREFETCH_TRANSIENT & current_rule->pf_conf) == PREFETCH_TRANSIENT ) {
				/*lrand48_r(&th->buffer, &offset);*/
				/* Random sub-cache-line offset (0..15) to vary alignment. */
				offset = random_no % (long)16;
				start_addr = (unsigned char *)starting_address + offset;
				rc = transient_dcbt((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern );
				if ( rc != 0 ) {
					sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%x Actual data = 0x%x thread_index : 0x%x Start of memory = %p, memory size = 0x%x\n" ,__LINE__,th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, starting_address, memory_fetch_size);
					hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
					dump_miscompare_data(thread_no, (unsigned char *)starting_address);
					return;
				}
			}
			/* if SIGTERM was received, exit */
			if (exit_flag != 0) {
				break;
			}
			if ( (PREFETCH_NA & current_rule->pf_conf) == PREFETCH_NA ) {
				/*lrand48_r(&th->buffer, &offset);*/
				offset = random_no % (long)16;
				start_addr = (unsigned char *)starting_address + offset;
				rc = prefetch_dcbtna((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern,&temp_storage,&temp_pattern);
				if ( rc != 0 ) {
					/* NOTE(review): "%x0x" below looks like a typo for "0x%x"
					 * (the single-algorithm path uses 0x%x) — confirm. */
					sprintf(msg,"[%d] Miscompare in Prefetch!! 
Expected data = 0x%x Actual data = 0x%x copied data = %x0x, copied pattern = %x0x, thread_index : 0x%x Start of memory = %p, memory size = 0x%x\n" ,__LINE__,th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), temp_storage, temp_pattern, thread_no, starting_address, memory_fetch_size);
					hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
					dump_miscompare_data(thread_no, (unsigned char *)starting_address);
					return;
				}
			}
			if (exit_flag != 0) {
				break;
			}
		}
		else {	/* Else Run only the specified algorithm */
			/*starting_address = (unsigned long long)(th_array[thread_no].start_of_contiguous_memory);
			memory_fetch_size = current_rule->prefetch_memory_size - BYTES_EXC ;*/
			if(th_array[thread_no].prefetch_algorithm == PREFETCH_NSTRIDE) {
				/*lrand48_r(&th->buffer, &random_no);*/
				n_stride(starting_address, memory_fetch_size, random_no, &th_array[thread_no].prefetch_scratch_mem[0]);
			}
			else if(th_array[thread_no].prefetch_algorithm == PREFETCH_PARTIAL) {
				partial_dcbt(starting_address, memory_fetch_size, random_no, &th_array[thread_no].prefetch_scratch_mem[0]);
			}
			else if(th_array[thread_no].prefetch_algorithm == PREFETCH_TRANSIENT) {
				/*lrand48_r(&th->buffer, &offset);*/
				offset = random_no % (long)16;
				start_addr = (unsigned char *)starting_address + offset;
				rc = transient_dcbt((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern );
				if ( rc != 0 ) {
					sprintf(msg,"[%d] Miscompare in Prefetch!! 
Expected data = 0x%x Actual data = 0x%x thread_index : 0x%x Start of memory = %p, memory size = 0x%x\n" ,__LINE__,th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, starting_address, memory_fetch_size);
					hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
					dump_miscompare_data(thread_no, (unsigned char *)starting_address);
					return;
				}
			}
			else if(th_array[thread_no].prefetch_algorithm == PREFETCH_IRRITATOR) {
				rc = do_prefetch( starting_address , memory_fetch_size , random_no, thread_no, loop_count, th_array[thread_no].pattern);
				if ( rc != 0 ) {
					sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%x Actual data = 0x%x thread_index : 0x%x Start of memory = %p, memory size = 0x%x\n" ,__LINE__,th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, starting_address, memory_fetch_size);
					hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
					dump_miscompare_data(thread_no, (unsigned char *)starting_address);
					return;
				}
			}
			else if ( th_array[thread_no].prefetch_algorithm == PREFETCH_NA ) {
				/*lrand48_r(&th->buffer, &offset);*/
				offset = random_no % (long)16;
				start_addr = (unsigned char *)starting_address + offset;
				rc = prefetch_dcbtna((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern,&temp_storage, &temp_pattern);
				if ( rc != 0 ) {
					sprintf(msg,"[%d] Miscompare in Prefetch ( returned %d)!! 
Expected data = 0x%x Actual data = 0x%x copied data = 0x%x, copied pattern = 0x%x, thread_index : 0x%x Start of memory = %p, offset = %d\n" ,__LINE__, rc, th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)start_addr + 128*(loop_count-rc)), temp_storage, temp_pattern, thread_no, starting_address, offset);
					hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
					dump_miscompare_data(thread_no, (unsigned char *)starting_address);
					return;
				}
			}
			/* if SIGTERM was received, exit */
			if(exit_flag != 0) {
				break;
			}
		}
	} /* End of for loop */

#ifdef __HTX_LINUX__
	/* Restore original/default CPU affinity so that it binds
	 * to ANY available processor */
	rc = htx_unbind_thread();
#else
	rc = bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
#endif
	if(rc == -1) {
		sprintf(msg, "%d: Unbinding from cpu:%d failed with errno %d \n",__LINE__, tid, errno);
		hxfmsg(&h_d, errno, HTX_HE_SOFT_ERROR, msg);
	}

#if defined(__HTX_MAMBO__) || defined(AWAN)
	printf("[%d] Thread no: %d, completed passes : %d\n",__LINE__, thread_no, oper);
#endif
}
/*
 * qcdoc_su3_recon - emit the assembly routine 'name' that multiplies
 * half-spinors by SU(3) gauge links and reconstructs full 4-spinors.
 *
 * This is a code GENERATOR: every queue_* / make_inst call appends an
 * instruction to the output stream rather than executing anything here.
 * The emitted routine takes four pointer arguments (psi, Umu, Chiin,
 * length) and, per site, performs the SU3 multiply for each mu followed
 * by the spin-reconstruction accumulation (dagger selects the sign
 * convention).  Register indices are juggled aggressively: the PSI
 * register file is reused for U rows, reduction temporaries and drains.
 * Statement order here IS the emitted instruction schedule — do not
 * reorder.
 */
void qcdoc_su3_recon( char *name)
{
  /**** This section defines all the registers and offsets I need ****/

  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(4);

  /*Handle for the loop entry point*/
  int branchsite;
  int branchmu;
  int retno ;

  /*------------------------------------------------------------------
   * Floating point registers
   *------------------------------------------------------------------
   */
  // Reconstruct 8 registers for 4 spinor
  // reg_array_2d(PSI,Fregs,4,2);
  reg_array_3d(PSI,Fregs,3,4,2);
  offset_3d(PSI_IMM,FourSpinType,4,3,2); /*Offsets within 4 spinor*/

  // Reconstruct 2 spinor registers
#define NEO 2
  reg_array_3d(Atmp,Fregs,1,2,2); /*CHIplus regs */
  reg_array_3d(Btmp,Fregs,1,2,2); /*CHIminus regs */
  /* Second (eo==1) slots start unassigned (-1); filled in later from PSI regs. */
  int A[NEO][2][2] = { Atmp[0][0][0], Atmp[0][0][1], Atmp[0][1][0], Atmp[0][1][1], -1,-1,-1,-1 };
  int B[NEO][2][2] = { Btmp[0][0][0], Btmp[0][0][1], Btmp[0][1][0], Btmp[0][1][1], -1,-1,-1,-1 };

  /*Regs for SU3 two spinor multiply ... overlap with the reconstruct*/
  /* registers */
  int CHIR[3][2][2] = { A[0][0][0],A[0][0][1], A[0][1][0],A[0][1][1], B[0][0][0],B[0][0][1], B[0][1][0],B[0][1][1], PSI[0][0][0],PSI[0][0][1], PSI[0][1][0],PSI[0][1][1] };
  offset_3d(CHI_IMM,TwoSpinType,3,2,2);

  /*Registers for the gauge link (2 rows)*/
  int UA[3][2] = { {PSI[0][2][0],PSI[0][2][1]}, {PSI[2][1][0],PSI[2][1][1]}, {PSI[1][0][0],PSI[1][0][1]} };
  int UB[3][2] = { {PSI[1][1][0],PSI[1][1][1]}, {PSI[2][0][0],PSI[2][0][1]}, {PSI[1][2][0],PSI[1][2][1]}, };
  offset_3d(GIMM , GaugeType, 3, 3 ,2 );

  // Other 8 registers used for reduction variables in SU3.
  // Could use these in reconstruct??
  int E[2] = { PSI[2][2][0],PSI[2][2][1]};
  /*
   * FCD used for drain of Chi
   * Overlap with PSI[*][3][*]
   */
  int F[2] = {PSI[0][3][0],PSI[0][3][1]};
  int C[2] = {PSI[1][3][0],PSI[1][3][1]};
  int D[2] = {PSI[2][3][0],PSI[2][3][1]};

  /*
   * Integer registers
   */
  alreg(psi,Iregs);
  alreg(Umu,Iregs);
  alreg(Ufetch,Iregs);
  alreg(Chiin,Iregs);
  alreg(Chiout,Iregs);
  alreg(Chifetch,Iregs);
  reg_array_1d(Chiplus,Iregs,4);/*Pointers to the 8 2-spinors for recombination*/
  reg_array_1d(Chiminus,Iregs,4);
  alreg(mu,Iregs);
  alreg(Chidrain,Iregs);
  alreg(pref,Iregs);
  alreg(mem,Iregs);
  alreg(length,Iregs);

  int Isize = PROC->I_size;
  int Fsize = PROC->FP_size;

  def_off( ZERO_IMM, Byte,0);
  def_off( PSI_ATOM, FourSpinType, 24);
  def_off( CHI_ATOM, TwoSpinType, 12);
  def_off( PAD_CHI_ATOM, TwoSpinType, 16);
  def_off( MAT_IMM, GaugeType, 18);
  int Ndim = def_offset(4,Byte,"Ndim");
  int Ndimm1 = def_offset(3,Byte,"Ndimm1");

  int hbias,bias;

  /*Offsets handles to stack*/
  int hbitbucket = def_offset(16*Isize,Byte,"hbitbucket");
  int Tsize;
  if ( TwoSpinType == Double ) Tsize = PROC->FP_size;
  else Tsize = PROC->FSP_size;
  /* Four per-mu 2-spinor scratch areas above the bit bucket. */
  int hstk0 = def_offset(16*Isize+12*Tsize ,Byte,"hstk0");
  int hstk1 = def_offset(16*Isize+2*12*Tsize,Byte,"hstk1");
  int hstk2 = def_offset(16*Isize+3*12*Tsize,Byte,"hstk2");
  int hstk3 = def_offset(16*Isize+4*12*Tsize,Byte,"hstk3");
  int hIsize = def_offset(Isize,Byte,"Isize");

  int i,co,j,k,nxt,ri,sp,nxtco,eop,eo_a,eo_b;

  /***********************************************************************/

  /*
   * PROLOGUE
   */
  make_inst(DIRECTIVE,Enter_Routine,name);

  /*Allocate stack save any callee save registers we need etc...*/
  int stack_buf_size;
  stack_buf_size = 16*Isize + 12*Fsize * 5 ;
  hbias = grab_stack(stack_buf_size);
  bias = get_offset(hbias);
  save_regs();
  queue_iadd_imm(mem,PROC->StackPointer,hbias); /*Pointer to buf on stack*/

  /*Define our arguments - all pointers ala fortran*/
  getarg(psi);
  getarg(Umu);
  getarg(Chiin);
  getarg(length);

  /*{... Process arguments ...*/
  queue_iload(length,ZERO_IMM,length); /*Load in sx counter*/
  retno = get_target_label(); /*Branch to exit if yzt <1*/
  check_iterations(length,retno);
  need_cache_line(0);
  need_cache_line(1);
  need_cache_line(2);
  need_cache_line(3);
  need_cache_line(4);
  pragma(DCBT_SPACE,5);
  pragma(DCBT_POST,1);

#define LOAD_U(comin,comax)\
  /*Load two link rows*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){ \
      queue_fload(UA[i][ri],GIMM[i][0][ri],Umu,GaugeType);\
      queue_fload(UB[i][ri],GIMM[i][1][ri],Umu,GaugeType);\
    } \
  }

#define PRELOAD_U LOAD_U(0,1)
#define POSTLOAD_U LOAD_U(2,2)

  PRELOAD_U

#define LOAD_CHI(comin,comax) \
  /*Load Chi column*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){\
      queue_fload(CHIR[i][0][ri],CHI_IMM[i][0][ri],Chiin,TwoSpinType);\
    } \
    for( ri=0;ri<2;ri++){\
      queue_fload(CHIR[i][1][ri],CHI_IMM[i][1][ri],Chiin,TwoSpinType);\
    } \
  }

#define PRELOAD_CHI LOAD_CHI(0,1)
#define POSTLOAD_CHI LOAD_CHI(2,2)

#define POSTLOAD \
  POSTLOAD_CHI \
  POSTLOAD_U

  do_prefetch(Chiin,0);
  do_prefetch(Chiin,1);
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chiin,2);

  PRELOAD_CHI

  /*
   * Start site loop
   */
  queue_iadd_imm(Chidrain,mem,hbitbucket); /* first drain goes to the bit bucket */
  branchsite = start_loop(length);
  queue_iadd_imm(Chiout,mem,hstk0);

  /*
   * Loop over mu in asm
   */
  queue_iload_imm(mu,Ndimm1);

#define CHIDRAIN \
  queue_fstore(F[0],CHI_IMM[1][1][0],Chidrain,TwoSpinType);\
  queue_fstore(F[1],CHI_IMM[1][1][1],Chidrain,TwoSpinType);\
  queue_fstore(C[0],CHI_IMM[2][0][0],Chidrain,TwoSpinType);\
  queue_fstore(C[1],CHI_IMM[2][0][1],Chidrain,TwoSpinType);\
  queue_fstore(D[0],CHI_IMM[2][1][0],Chidrain,TwoSpinType);\
  queue_fstore(D[1],CHI_IMM[2][1][1],Chidrain,TwoSpinType);

#define PREFETCH_CHI \
  queue_iadd_imm(Chifetch,Chiin,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

#define PREFETCH_CHIF \
  queue_iadd_imm(Chifetch,Chifetch,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

  /* unroll==0 emits the mu-loop body; unroll==1 emits the peeled final pass. */
  for ( int unroll=0; unroll<2; unroll++ ) {

    if ( unroll==0 ) {
      branchmu = start_loop(mu);
      pragma(DCBT_SPACE,5);
      pragma(STORE_LIM,1);
      pragma(LOAD_LIM,2);
    } else {
      pragma(STORE_LIM,2);
      pragma(DCBT_SPACE,5);
      pragma(DCBT_POST,1);
      pragma(DCBT_PRE,0);
      pragma(LOAD_LIM,2);
    }

    CHIDRAIN
    POSTLOAD

    if ( unroll == 0 ) {
      PREFETCH_CHI
      queue_iadd_imm(Ufetch,Umu,MAT_IMM);
      do_prefetch(Ufetch,0);
      do_prefetch(Ufetch,1);
      do_prefetch(Ufetch,2);
      if ( GaugeType == Double ) {
        do_prefetch(Ufetch,3);
        do_prefetch(Ufetch,4);
      }
    } else {
      pragma(DCBT_SPACE,3);
      PREFETCH_CHI
      PREFETCH_CHIF
      PREFETCH_CHIF
      PREFETCH_CHIF
    }

    /* First three complex dot products: rows UA/UB against Chi column 0/1. */
    j=0;
    queue_three_cmuls(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                      D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                      E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
    j=1;
    queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
    j=2;
    queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);

    /*Store the first three results*/
    queue_fstore(C[0],CHI_IMM[0][0][0],Chiout,TwoSpinType);
    queue_fstore(C[1],CHI_IMM[0][0][1],Chiout,TwoSpinType);
    queue_fstore(D[0],CHI_IMM[0][1][0],Chiout,TwoSpinType);
    queue_fstore(D[1],CHI_IMM[0][1][1],Chiout,TwoSpinType);
    queue_fstore(E[0],CHI_IMM[1][0][0],Chiout,TwoSpinType);
    queue_fstore(E[1],CHI_IMM[1][0][1],Chiout,TwoSpinType);

    /*Load the third row*/
    for(j=0; j<3; j++) {
      for(ri=0; ri<2; ri++) {
        queue_fload(UA[j][ri],GIMM[j][2][ri],Umu,GaugeType);
      }
    }

    /*Gauge layout is linear, mu faster than site*/
    queue_iadd_imm(Umu,Umu,MAT_IMM);

    /*Now the second set of three cdots*/
    j=0;
    queue_three_cmuls(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                      C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                      D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    j=1;
    queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    j=2;
    queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    /**************END SU3 CODE *************/

    queue_iadd_imm(Chiin,Chiin,PAD_CHI_ATOM);
    queue_iadd_imm(Chidrain,Chiout,ZERO_IMM); /* drain last results into this mu's stack slot */
    queue_iadd_imm(Chiout,Chiout,CHI_ATOM);

    if ( unroll == 0 ) {
      PRELOAD_U
      PRELOAD_CHI
    }

    /*********************************************************/
    /****************** END OF SU3 MULTIPLY ******************/
    /*********************************************************/
    if ( unroll== 0 ) {
      stop_loop(branchmu,mu); /* End loop over mu*/
      make_inst(DIRECTIVE,Target,get_target_label() ); /*delineate the sections*/
    }
  }

  /*********************************************************/
  /****************** START OF RECONSTRUCT *****************/
  /*********************************************************/

  //Address calculation...
  // Chiminus -> Stack and ChiPlus -> Chiin
  pragma(STORE_INORDER,1);
  queue_iadd_imm(Chiminus[0],mem,hstk0);

  /*For register use reasons loop over colour outermost*/
#define LOAD_CHI_MU0(eo,co) \
  for( sp = 0; sp<2;sp++ ){\
    for( ri = 0; ri<2;ri++ ){\
      queue_fload(A[eo][sp][ri],CHI_IMM[co][sp][ri],Chiminus[0],TwoSpinType);\
      if ( co == 0 ) {\
        queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiin,TwoSpinType);\
        queue_iadd_imm(Chiplus[0],Chiin,ZERO_IMM);\
      } else {\
        queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiplus [0],TwoSpinType);\
      }\
  }}

  pragma(LOAD_LIM,2);
  LOAD_CHI_MU0(0,0)
  pragma(DCBT_POST,1);
  CHIDRAIN

  int neo_a = NEO;
  int neo_b = NEO;
  eo_a = 0;
  eo_b = 0;
  for ( co = 0; co <3 ; co ++ ) {
    pragma(LOAD_LIM,1);
    if ( co == 0 ) {
      // Use the third colour for unrolling the loads
      A[1][0][0] = PSI[2][0][0];
      A[1][0][1] = PSI[2][0][1];
      A[1][1][0] = PSI[2][1][0];
      A[1][1][1] = PSI[2][1][1];
      B[1][0][0] = PSI[2][2][0];
      B[1][0][1] = PSI[2][2][1];
      B[1][1][0] = PSI[2][3][0];
      B[1][1][1] = PSI[2][3][1];
      queue_iadd_imm(Chiminus[1],mem,hstk1);
      // This is invariant of loop // Take out
      queue_iadd_imm(Chiplus[1],Chiin ,PAD_CHI_ATOM);
    }

    /***************************************************************
     * MU = 0 reconstruct                                          *
     ****************************************************************/
    if ( co == 2 ) {
      // Flip to not unrolled due to register pressure
      neo_b = 1;
      neo_a = 2;
      A[1][0][0] = PSI[0][0][0];
      A[1][0][1] = PSI[0][0][1];
      A[1][1][0] = PSI[1][0][0];
      A[1][1][1] = PSI[1][0][1];
      pragma(DCBT_POST,0);
      pragma(DCBT_SPACE,1);
      queue_iadd_imm(Ufetch,Umu,ZERO_IMM);
      //   do_prefetch(Ufetch,0);
      do_prefetch(Ufetch,1);
      do_prefetch(Ufetch,2);
      if ( GaugeType == Double ) {
        do_prefetch(Ufetch,3);
        do_prefetch(Ufetch,4);
      }
    }

    /* psi_0 = Chiplus[0] + Chiminus[0] */
    /* psi_1 = Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],B[eo_b][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],B[eo_b][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],B[eo_b][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],B[eo_b][1][1],A[eo_a][1][1]);

    // Dagger = 0:
    /* psi_2 =-iChiplus[1] +iChiminus[1] */
    /* psi_3 =-iChiplus[0] +iChiminus[0] */
    // Dagger = 1:
    /* psi_2 = iChiplus[1] -iChiminus[1] */
    /* psi_3 = iChiplus[0] -iChiminus[0] */
    if ( dagger == 0 ) {
      queue_fsub(PSI[co][2][0],B[eo_b][1][1],A[eo_a][1][1]);
      queue_fsub(PSI[co][2][1],A[eo_a][1][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][0],B[eo_b][0][1],A[eo_a][0][1]);
      queue_fsub(PSI[co][3][1],A[eo_a][0][0],B[eo_b][0][0]);
    } else {
      queue_fsub(PSI[co][2][0],A[eo_a][1][1],B[eo_b][1][1]);
      queue_fsub(PSI[co][2][1],B[eo_b][1][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][3][0],A[eo_a][0][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][1],B[eo_b][0][0],A[eo_a][0][0]);
    }

    /***************************************************************
     * MU = 1 reconstruct                                          *
     ****************************************************************/
    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[1],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [1],TwoSpinType);
      }
    }
    if ( co == 0 ) {
      queue_iadd_imm(Chiminus[2],mem,hstk2);
      queue_iadd_imm(Chiminus[3],mem,hstk3);
      queue_iadd_imm(Chiplus[2],Chiplus[1],PAD_CHI_ATOM);
      queue_iadd_imm(Chiplus[3],Chiplus[2],PAD_CHI_ATOM);
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);
    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    //Dagger == 0
    /* psi_2 +=  Chiplus[1] - Chiminus[1] */
    /* psi_3 += -Chiplus[0] + Chiminus[0] */
    //Dagger == 1
    /* psi_2 -=  Chiplus[1] - Chiminus[1] */
    /* psi_3 -= -Chiplus[0] + Chiminus[0] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
    }

    /***************************************************************
     * MU = 2 reconstruct                                          *
     ****************************************************************/
    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[2],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [2],TwoSpinType);
      }
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);
    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    //Dagger == 0
    /* psi_2 +=-iChiplus[0] +iChiminus[0] */
    /* psi_3 += iChiplus[1] -iChiminus[1] */
    //Dagger == 1
    /* psi_2 -=-iChiplus[0] +iChiminus[0] */
    /* psi_3 -= iChiplus[1] -iChiminus[1] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
    }

    /***************************************************************
     * MU = 3 reconstruct                                          *
     ****************************************************************/
    pragma(LOAD_LIM,2);
    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[3],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [3],TwoSpinType );
      }
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

    //Dagger == 0
    /* psi_2 += Chiplus[0] - Chiminus[0] */
    /* psi_3 += Chiplus[1] - Chiminus[1] */
    //Dagger == 1
    /* psi_2 -= Chiplus[0] - Chiminus[0] */
    /* psi_3 -= Chiplus[1] - Chiminus[1] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
    }
    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);
    if ( dagger == 0 ) {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
    } else {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
    }

    /*
     * Store the spinors. If this is problematic
     * in terms of PEC WriteBuf misses, I could
     * store to the stack and copy out later.
     */
    if ( co != 2 ) {
      LOAD_CHI_MU0(0,co+1)
      eo_a=0;
      eo_b=0;
    }
    queue_fstore(PSI[co][0][0],PSI_IMM[0][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][0][1],PSI_IMM[0][co][1],psi,FourSpinType);
  }

  /*
   * Store out in linear order now
   */
  pragma(STORE_LIM,2);
  pragma(DCBT_SPACE,8);
  for ( co=0; co<3; co ++ ) {
    queue_fstore(PSI[co][1][0],PSI_IMM[1][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][1][1],PSI_IMM[1][co][1],psi,FourSpinType);
  }
  for ( co=0; co<3; co ++ ) {
    queue_fstore(PSI[co][2][0],PSI_IMM[2][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][2][1],PSI_IMM[2][co][1],psi,FourSpinType);
  }
  if ( TwoSpinType == FourSpinType ) {
    /* spin-3 stores can be folded into the next loop's CHIDRAIN */
    queue_iadd_imm(Chidrain,psi,CHI_ATOM);
  } else {
    queue_iadd_imm(Chidrain,mem,hbitbucket);
    for ( co=0; co<3; co ++ ) {
      queue_fstore(PSI[co][3][0],PSI_IMM[3][co][0],psi,FourSpinType);
      queue_fstore(PSI[co][3][1],PSI_IMM[3][co][1],psi,FourSpinType);
    }
  }
  queue_iadd_imm(psi,psi,PSI_ATOM);

  /*
   * Put in an artificial dependency here
   * to try to stop the preloads getting above the last load of
   * reconstruct.
   */
  queue_iadd_imm(Chiplus[3],Chiplus[3],ZERO_IMM);
  queue_iadd_imm(Chiin ,Chiplus[3],PAD_CHI_ATOM);

  pragma(DCBT_SPACE,0);
  do_prefetch(Chiin,0);
  do_prefetch(Chiin,1);
  if ( SizeofDatum(TwoSpinType) == 8 )do_prefetch(Chiin,2);
  PRELOAD_U
  PRELOAD_CHI

  /* TERMINATION point of the loop*/
  stop_loop(branchsite,length);
  CHIDRAIN
  make_inst(DIRECTIVE,Target,retno);

  /*
   *
   * EPILOGUE
   *
   */
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);
  return;
}
/* Flush a cache line, then immediately re-request it, so the line is
 * guaranteed to travel through the memory hierarchy again. */
void touch(int addr, int line)
{
	do_flush(addr, line);
	if (!dd2) {
		do_prefetch(addr, line);  /* pre-DD2 parts: plain prefetch */
	} else {
		l2_touch(addr, line);     /* DD2 silicon: use the L2 touch op */
	}
}
/*
 * qcdoc_merge - emit the assembly routine 'name' that interleaves two
 * spinor input streams into one SIMD-merged output stream.
 *
 * This is a code GENERATOR (queue_* / make_inst append instructions).
 * The emitted routine takes (outptr, vec1ptr, vec2ptr, counter): per
 * iteration it loads 3 colours x complex from each input, SIMD-merges
 * vec1/vec2 pairwise into 6 output registers, and stores them.  When the
 * global half_precision flag is set, loads/stores go through the
 * half-precision conversion helpers using Mask/Convert1/Convert2 scratch
 * registers.  Statement order is the emitted schedule — do not reorder.
 */
void qcdoc_merge( char *name)
{
  int dum = defargcount(5);

  /*Integer register usage*/
  alreg(outptr,Iregs);
  alreg(vec1ptr,Iregs);
  alreg(vec2ptr,Iregs);
  alreg(counter,Iregs);

  /*Floating register usage*/
  reg_array_1d(vec1,Cregs,3);
  reg_array_1d(vec2,Cregs,3);
  reg_array_1d(oreg,Cregs,6);
  alreg(permreg,Cregs);

  def_off(ZERO,SpinorType,0);
  def_off (IN_ATOM,SpinorType,6*nsimd());  // 2spins worth, 3 colors x complex
  def_off (OUT_ATOM,SpinorType,12*nsimd());// 2spins worth, 3 colors x complex x simd
  def_off(bits16,Byte,0xFFFF);
  def_off(thirtytwo,Byte,32);
  def_off(sixteen,Byte,16);
  offset_2d(CHI_IMM,SpinorType,6,2*nsimd());

  int Isize = PROC->I_size;
  int word_size = def_offset(Isize,Byte,"word_size");

  struct stream *PreOut;
  struct stream *PreVec1;
  struct stream *PreVec2;

  int brchno,retno; /*Branch target handles*/
  int co;

  make_inst(DIRECTIVE,Enter_Routine,name);
  int bias = grab_stack(64);
  save_regs();
  queue_iadd_imm(PROC->StackPointer,PROC->StackPointer,bias);
  getarg(outptr); /*Get args*/
  getarg(vec1ptr);
  getarg(vec2ptr);
  getarg(counter);

  /* Scratch registers for the half-precision conversion path. */
  alreg(Mask,Iregs);
  alreg(Convert1,Iregs);
  alreg(Convert2,Iregs);
  int memory = PROC->StackPointer;

  for (int i =0; i<6; i++ ) {
    need_constant(i*2*SizeofDatum(SpinorType)*nsimd());
  }
  need_constant(64);
  complex_simd_init(permreg);

  if ( half_precision ) {
    /* Build the 64-bit mask 0xFFFFFFFFFFFF0000 used by the converters. */
    queue_iload_imm(Mask,ZERO);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,thirtytwo);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,sixteen);
  }

  /*
   * Insert a label to prevent reordering
   */
  make_inst(DIRECTIVE,Target,get_target_label());

  PreVec1= create_stream(IN_ATOM,vec1ptr ,counter,STREAM_IN ,LINEAR);
  PreVec2= create_stream(IN_ATOM,vec2ptr ,counter,STREAM_IN ,LINEAR);
  PreOut = create_stream(OUT_ATOM,outptr ,counter,STREAM_OUT ,LINEAR);

  /*Branch to stack restore if length <1*/
  retno = get_target_label();
  check_iterations(counter,retno);

  /*
   * Start software pipeline
   */
  brchno = start_loop(counter);

  int indco[3]={0,1,2};
  int permute_mu=3;
  for(int ico=0;ico<3;ico++){
    co = indco[ico];
    // Could do entirely in integer unit for half precision to accelerate this
    if ( half_precision ) {
      complex_load_half(vec1[co],CHI_IMM[co][0],vec1ptr,memory,Convert1,Convert2,Mask);
      complex_load_half(vec2[co],CHI_IMM[co][0],vec2ptr,memory,Convert1,Convert2,Mask);
    } else {
      complex_load(vec1[co],CHI_IMM[co][0],vec1ptr,SpinorType);
      complex_load(vec2[co],CHI_IMM[co][0],vec2ptr,SpinorType);
    }
  }

  {
    // Merge the vectors
    for(co=0;co<3;co++) complex_simd_merge (0,permute_mu,oreg[co*2] ,vec1[co],vec2[co]);
    for(co=0;co<3;co++) complex_simd_merge (1,permute_mu,oreg[co*2+1],vec1[co],vec2[co]);
  }

  //  make_inst(DIRECTIVE,LS_BARRIER);
  for(int i=0;i<6;i++){
    // 2 SIMD sites, 3 colors, 2 spins 2 complex == 24 floats
    if ( half_precision ) {
      complex_store_half(oreg[i],CHI_IMM[i][0],outptr,memory,Convert1,Convert2,Mask);
    } else {
      complex_store(oreg[i],CHI_IMM[i][0],outptr,SpinorType);
    }
  }

  /* Advance the input streams and prefetch the next atoms. */
  iterate_stream(PreVec1);
  iterate_stream(PreVec2);
  do_prefetch(vec1ptr,0);
  do_prefetch(vec2ptr,0);
  do_prefetch(vec1ptr,1);
  do_prefetch(vec2ptr,1);
  iterate_stream(PreOut);

  stop_loop(brchno,counter);
  make_inst(DIRECTIVE,Target,retno);

  queue_isub_imm(PROC->StackPointer,PROC->StackPointer,bias);
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);
  return;
}