/*
 * Emit the tail of a counted loop for the usparcIIs target.
 *
 *   branchno - label handle of the loop head to branch back to
 *   counter  - integer register handle used as the loop counter
 *
 * The counter is advanced by the Hdecrement offset, then OR-ed with itself
 * to set the predicate, and a BRANCH_GT back to `branchno` is queued.
 */
void stop_loop_usparcIIs(int branchno,int counter)
{
  /* Step the counter by Hdecrement, writing the result back into it. */
  queue_load_addr(counter, Hdecrement, counter);

  /*
   * One integer op is wasted here: the condition code could be set
   * directly with an addcc instead of the extra OR below.
   */
  queue_barrier();
  make_inst(IALUPIPE, IOR_PREDICATE, counter, counter, counter);

  /* Loop again while the counter is still greater than zero. */
  make_inst(BRCHPIPE, BRANCH_GT, branchno);
}
/*
 * Conditional move for the knc target, built from a branch around a move:
 * `src` is copied into `dst` only when `cond` is <= 0.
 *
 *   cond - register handle tested against zero
 *   src  - register handle copied from
 *   dst  - register handle copied to
 */
void queue_cmovle_knc(int cond,int src,int dst)
{
  /* Fresh label marking the instruction just after the move. */
  int skip_label = get_target_label();

  /* OR cond with itself: sets the predicate register. */
  make_inst(IALUPIPE, IOR_PREDICATE, cond, cond, cond);

  /* cond > 0: jump over the move. */
  make_inst(BRCHPIPE, BRANCH_GT, skip_label);

  /* cond <= 0: perform the copy. */
  make_inst(IALUPIPE, MOV, dst, src);

  make_inst(DIRECTIVE, CmovTarget, skip_label);
}
/*
 * Conditional move for the ppc440s target, built from a branch around a
 * copy: `src` is copied into `dst` only when `cond` is >= 0.
 *
 *   cond - register handle tested against zero
 *   src  - register handle copied from
 *   dst  - register handle copied to
 */
void queue_cmovge_ppc440s(int cond,int src,int dst)
{
  /* Fresh label marking the instruction just after the copy. */
  int skip_label = get_target_label();

  /* OR cond with itself: sets the predicate register. */
  make_inst(IALUPIPE, IOR_PREDICATE, cond, cond, cond);

  /* cond < 0: jump over the copy. */
  make_inst(BRCHPIPE, BRANCH_LT, skip_label);

  /* cond >= 0: the copy is expressed as dst = src OR src. */
  make_inst(IALUPIPE, IOR, dst, src, src);

  make_inst(DIRECTIVE, CmovTarget, skip_label);
}
/*
 * Conditional move for the powerIII target, built from a branch around a
 * copy: `src` is copied into `dst` only when `cond` is < 0.
 *
 *   cond - register handle tested against zero
 *   src  - register handle copied from
 *   dst  - register handle copied to
 */
void queue_cmovlt_powerIII(int cond,int src,int dst)
{
  /* Fresh label marking the instruction just after the copy. */
  int skip_label = get_target_label();

  /* OR cond with itself: sets the predicate register. */
  make_inst(IALUPIPE, IOR_PREDICATE, cond, cond, cond);

  /* cond >= 0: jump over the copy. */
  make_inst(BRCHPIPE, BRANCH_GE, skip_label);

  /* cond < 0: the copy is expressed as dst = src OR src. */
  make_inst(IALUPIPE, IOR, dst, src, src);

  make_inst(DIRECTIVE, CmovTarget, skip_label);
}
int start_loop_usparcIIs(int counter) { (void)counter; // suppresses "unused parameter" compiler warning int lab = get_target_label(); Hdecrement = def_offset(-1,Byte,"minus1"); make_inst(BRCHPIPE,BRANCH,lab); make_inst(DIRECTIVE,Cache_Align,lab); make_inst(DIRECTIVE,Target,lab); return(lab); }
/*
 * Emit the head of a counted loop for the ppc440s target.
 *
 *   counter - loop counter register handle
 *
 * Defines the global Hdecrement offset (-1) and sets the global
 * `innermost` flag to 1.  A COUNTER_HINT instruction is queued for
 * `counter`, followed by a branch to the new loop-head label, a
 * cache-alignment directive and the label itself.  Returns the label handle.
 */
int start_loop_ppc440s(int counter)
{
  int loop_head = get_target_label();

  Hdecrement = def_offset(-1, Byte, "minus1");
  innermost  = 1;

  /* mtctr the register associated with counter */
  make_inst(CACHPIPE,  COUNTER_HINT, counter);

  make_inst(BRCHPIPE,  BRANCH,       loop_head);
  make_inst(DIRECTIVE, Cache_Align,  loop_head);
  make_inst(DIRECTIVE, Target,       loop_head);

  return loop_head;
}
/*
 * Emit the tail of a counted loop for the powerIII target.
 *
 *   branchno - label handle of the loop head to branch back to
 *   counter  - integer register handle used as the loop counter
 *
 * The counter is always stepped by Hdecrement.  When the global
 * `innermost` flag is set, the auto-decrementing BRANCH_CTR is used and
 * the flag is cleared; otherwise the predicate is set from the counter
 * and a BRANCH_GT is queued.
 */
void stop_loop_powerIII(int branchno,int counter)
{
  queue_load_addr(counter, Hdecrement, counter);

  if ( !innermost ) {
    /* Generic path: test the counter explicitly. */
    queue_barrier();
    make_inst(IALUPIPE, IOR_PREDICATE, counter, counter, counter); /* sets the predicate reg */
    make_inst(BRCHPIPE, BRANCH_GT, branchno);
    return;
  }

  /* Use the auto decrementing branch if possible. */
  make_inst(BRCHPIPE, BRANCH_CTR, branchno);
  innermost = 0;
}
int start_loop_knc(int counter) { (void)counter; // suppresses "unused parameter" compiler warning int lab = get_target_label(); /* Target label of head of loop */ Hdecrement = def_offset(-1,Byte,"minus1"); /* Defines counter decrement value */ make_inst(DIRECTIVE, Target, lab); return lab; }
/*
 * dwf_deriv: generate the routine called `name`.
 *
 * The generated routine receives a single pointer to an argument list
 * (defargcount(1)).  The list is read slot by slot, Isize apart, into:
 *   0: X_p   1: Y_p   2: F_p   3: length   4: Ls   5: tab   6: Complex_i
 *   7: (stepped over, nothing loaded)      8: recbuf_base
 *
 * Emitted structure:
 *   site loop over `length`
 *     s loop over `Ls`
 *       for mu = 0..3 (forward direction only, dir = 2*mu+1):
 *         - obtain a two-spinor in Y[0..1], either loaded directly from the
 *           receive buffer (mask bit set) or spin-projected from a four
 *           spinor read relative to Y_p (mask bit clear)
 *         - rebuild Y[2..3] from Y[0..1]
 *         - load X from X_p
 *         - accumulate complex_conjmadd(F, X[sp][co_x], Y[sp][co_y]) over the
 *           four spin components into the 3x3 matrix at F_p_s
 *
 * The code generator itself returns nothing; all output goes through
 * make_inst() and the queue_* / complex_* helpers.
 */
void dwf_deriv( char *name)
{
  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(1);

  /* Label handle for the routine exit (target of the empty-loop check). */
  int retno;

  /*
   * S=phi^dag (MdagM)^-1 phi
   *
   * dS = phi^dag (MdagM)^-1 [ dMdag M + Mdag dM ] (MdagM)^-1 phi
   *
   * Let X = (MdagM)^-1 phi
   *     Y = M X = M^-dag phi
   *
   * Want terms: Ydag dM X
   *             Xdag dMdag Y
   *
   * Take Xdag 1-gamma Y
   *
   * Still a bit confused about the 1+g 1-g terms; but this may be simply a factor of two as we add +h.c.
   * Will continue to follow Chroma's routine
   */
  reg_array_2d(Y,Cregs,4,3); // 4 spinor: 4 spin x 3 colour complex registers
  reg_array_2d(X,Cregs,4,3); // 4 spinor: same 4 x 3 shape as Y
  reg_array_1d(F,Cregs,3);   // Force (one column of three colour entries at a time)
  alreg(Z,Cregs);            // Zero (loaded from CONST_ZERO_OFFSET below)
  alreg(creg,Cregs);         // Passed to complex_constants_prepare() below

  /* Offset tables; the last dimension is scaled by 2*nsimd(). */
  offset_3d(CHIIMM,FourSpinType,2,3,2*nsimd());
  offset_3d(PSIIMM,FourSpinType,4,3,2*nsimd());
  offset_3d(GIMM ,GaugeType, 3, 3 ,2*nsimd() );

  def_off( GAUGE_SITE_IMM, FourSpinType,4*18*nsimd());
  def_off( MAT_IMM , GaugeType,18*nsimd());        /* stride between the per-direction matrices at F_p_s */
  def_off( PSI_IMM , FourSpinType,24*nsimd());     /* X_p stride per s iteration */
  def_off( CHI_IMM , FourSpinType,12*nsimd());     /* s_offset stride per s iteration */
  def_off( CONST_ZERO_OFFSET,Double,2*2*nsimd());

  /*
   * Integer registers
   */
  alreg(F_p,Iregs); /*Pointer to the current cpt of force field */
  alreg(F_p_s,Iregs); /* Working copy of F_p, advanced per direction inside the s loop */
  alreg(Y_mu,Iregs); /* Scratch: mask test result, then address of the mu neighbour */
  alreg(Y_p,Iregs);
  alreg(X_p,Iregs);
  alreg(length,Iregs); /*number of sites*/
  alreg(tab,Iregs); /*Pointer to current entry in offset table*/
  alreg(Complex_i,Iregs);/*Point to (0,1)x Nsimd*/
  alreg(Ls,Iregs);
  alreg(s,Iregs); /* s loop counter, initialised from Ls for every site */
  alreg(recbuf_base,Iregs);
  alreg(args,Iregs);
  alreg(s_offset,Iregs); /* Running offset for the current s, in steps of CHI_IMM */

  /*Useful integer immediate constants, in units of Fsize*/
  def_off( ZERO_IMM,Byte,0);
  def_off( minusone,Byte,-1);
  def_off( one,Byte,1);

  // Mask bits for predicating directions
  def_off( mask_0,Byte,1);
  def_off( mask_1,Byte,2);
  def_off( mask_2,Byte,4);
  def_off( mask_3,Byte,8);
  def_off( mask_4,Byte,16);
  def_off( mask_5,Byte,32);
  def_off( mask_6,Byte,64);
  def_off( mask_7,Byte,128);
  int mask_imm[8] = { mask_0, mask_1, mask_2, mask_3, mask_4, mask_5, mask_6, mask_7 };
  alreg(mask,Iregs);

  /* 17 table entries: [dir] are neighbour offsets, [10] is the mask, [16] is the step to the next site's entry. */
  offset_1d(TAB_IMM,TableType,17);

  // Integer sizes
  int Isize = def_offset(PROC->I_size,Byte,"Isize");
  int ISsize = def_offset(PROC->IS_size,Byte,"ISsize");
  int i,j,co,sp;

  /*********************************************************************/

  make_inst(DIRECTIVE,Enter_Routine,name);

  grab_stack(0);
  save_regs();

  /*********************************************
   * our arguments
   *********************************************
   */
  getarg(args); /*Pointer to arg list*/

  /* Each line loads one slot and then steps `args` on by Isize. */
  queue_iload(X_p, ZERO_IMM,args); queue_load_addr(args,Isize,args); //0
  queue_iload(Y_p, ZERO_IMM,args); queue_load_addr(args,Isize,args); //1
  queue_iload(F_p, ZERO_IMM,args); queue_load_addr(args,Isize,args); //2
  queue_iload(length,ZERO_IMM,args); queue_load_addr(args,Isize,args); //3
  queue_iload(Ls, ZERO_IMM,args); queue_load_addr(args,Isize,args); //4
  queue_iload(tab, ZERO_IMM,args); queue_load_addr(args,Isize,args); //5
  queue_iload(Complex_i,ZERO_IMM,args);queue_load_addr(args,Isize,args); //6
  queue_load_addr(args,Isize,args); //7 (slot skipped: nothing is loaded from it)
  queue_iload(recbuf_base,ZERO_IMM,args);queue_load_addr(args,Isize,args); //8

  /**************************************************
   * Load common constants into Iregs
   **************************************************
   */
  for (int i =0; i<12; i++ ) {
    need_constant(i*2*SizeofDatum(FourSpinType)*nsimd());
  }
  for (int i =0; i<9; i++ ) {
    need_constant(i*2*SizeofDatum(GaugeType)*nsimd());
  }

  complex_constants_prepare(creg,Complex_i);
  complex_load(Z,CONST_ZERO_OFFSET,Complex_i,Double);

  // Site loop
  retno = get_target_label();
  check_iterations(length,retno); /* presumably skips to retno when length gives no iterations - confirm */
  int branchsite = start_loop(length);

  // S loop
  queue_iload_short(mask,TAB_IMM[10],tab); /* per-site direction mask from the table */
  queue_iadd_imm (s,Ls,ZERO_IMM);          /* s = Ls */
  queue_iload_imm(s_offset,ZERO_IMM);      /* s_offset = 0 */
  int branchls = start_loop(s);

  /* Restart the force pointer at this site's first matrix for every s. */
  queue_iadd_imm(F_p_s,F_p,ZERO_IMM);
  // debugI(s);

  // Loop over directions
  for ( int mu=0;mu<4;mu++ ) {

    int dir = mu*2+1; // Always in forward dir

    // Complex branch structure for interior/exterior neighbours
    int lab_proj_mu = get_target_label();
    int lab_continue = get_target_label();

    queue_iand_imm (Y_mu,mask,mask_imm[dir]); // non-zero if exterior
    check_iterations(Y_mu,lab_proj_mu);       // branches to the interior path when the bit is clear (presumably) - confirm

    // Exterior points are already projected. Just load.
    queue_iload_short(Y_mu,TAB_IMM[dir],tab);
    queue_iadd (Y_mu,Y_mu,recbuf_base);
    // debugI(Y_mu);
    //debugI(recbuf_base);
    queue_iadd (Y_mu,Y_mu,s_offset);

    for(int sp=0;sp<2;sp++){
      for(int co=0;co<3;co++){
        complex_load(Y[sp][co],PSIIMM[sp][co][0],Y_mu,FourSpinType);
      }
    }
    jump(lab_continue);

    make_inst(DIRECTIVE,Target,lab_proj_mu);

    // Interior points are not already projected.
    // * Spin project 4 spinor
    queue_iload_short(Y_mu,TAB_IMM[dir],tab);
    // debugI(tab);
    // debugI(Y_mu);
    queue_iadd (Y_mu,Y_mu,Y_p);
    /*
     * s_offset is added twice here but only once on the exterior path.
     * NOTE(review): s_offset steps by CHI_IMM (12*nsimd) while a four spinor
     * is PSI_IMM (24*nsimd), so the double add looks intentional - confirm.
     */
    queue_iadd (Y_mu,Y_mu,s_offset); // offset for this "s"
    queue_iadd (Y_mu,Y_mu,s_offset); // offset for this "s"

    /* X is used as scratch for the unprojected four spinor; it is reloaded from X_p below. */
    for(int sp=0;sp<4;sp++){
      for(int co=0;co<3;co++){
        complex_load(X[sp][co],PSIIMM[sp][co][0],Y_mu,FourSpinType);
        // debugC(X[sp][co]);
      }
    }

    int pm = 1; // pm=0 == 1+gamma, pm=1 => 1-gamma
    if ( dagger ) pm = 0;

    /* Project to two spin components: Y[0], Y[1] are built from pairs of X rows chosen by mu. */
    if ( mu == 0 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_ApiB(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_ApiB(Y[1][co],X[1][co],X[2][co]);
      } else {
        for(co=0;co<3;co++) complex_AmiB(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_AmiB(Y[1][co],X[1][co],X[2][co]);
      }
    } else if ( mu == 1 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_sub(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_add(Y[1][co],X[1][co],X[2][co]);
      } else {
        for(co=0;co<3;co++) complex_add(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_sub(Y[1][co],X[1][co],X[2][co]);
      }
    } else if ( mu == 2 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_ApiB(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_AmiB(Y[1][co],X[1][co],X[3][co]);
      } else {
        for(co=0;co<3;co++) complex_AmiB(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_ApiB(Y[1][co],X[1][co],X[3][co]);
      }
    } else if ( mu == 3 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_add(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_add(Y[1][co],X[1][co],X[3][co]);
      } else {
        for(co=0;co<3;co++) complex_sub(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_sub(Y[1][co],X[1][co],X[3][co]);
      }
    }

    /* Both the exterior and the interior path join here with Y[0], Y[1] filled. */
    make_inst(DIRECTIVE,Target,lab_continue);

    ///////////////////////////////////////////////////////////////
    // Y contains spin projection of forward neighbour in mu direction
    // Repromote to Y to 4 spinor
    // (Y[2], Y[3] are formed from the zero register Z and Y[0] / Y[1];
    //  exactly one line per target row fires for a given mu and pm.)
    ///////////////////////////////////////////////////////////////
    for(int co_y=0;co_y<3;co_y++){

      if ( (mu==0) && (pm==0) ) complex_AmiB(Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==0) && (pm==1) ) complex_ApiB(Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==1) && (pm==0) ) complex_add (Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==1) && (pm==1) ) complex_sub (Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==2) && (pm==0) ) complex_AmiB(Y[2][co_y],Z,Y[0][co_y]);
      if ( (mu==2) && (pm==1) ) complex_ApiB(Y[2][co_y],Z,Y[0][co_y]);
      if ( (mu==3) && (pm==0) ) complex_add (Y[2][co_y],Z,Y[0][co_y]);
      if ( (mu==3) && (pm==1) ) complex_sub (Y[2][co_y],Z,Y[0][co_y]);

      if ( (mu==0) && (pm==0) ) complex_AmiB(Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==0) && (pm==1) ) complex_ApiB(Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==1) && (pm==0) ) complex_sub (Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==1) && (pm==1) ) complex_add (Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==2) && (pm==0) ) complex_ApiB(Y[3][co_y],Z,Y[1][co_y]);
      if ( (mu==2) && (pm==1) ) complex_AmiB(Y[3][co_y],Z,Y[1][co_y]);
      if ( (mu==3) && (pm==0) ) complex_add (Y[3][co_y],Z,Y[1][co_y]);
      if ( (mu==3) && (pm==1) ) complex_sub (Y[3][co_y],Z,Y[1][co_y]);
    }

    ///////////////////////////////////////////////////////////////
    // Load X
    ///////////////////////////////////////////////////////////////
    for(int co_x=0;co_x<3;co_x++){
      for(int sp=0;sp<4;sp++) {
        complex_load(X[sp][co_x],PSIIMM[sp][co_x][0],X_p,FourSpinType);
      }
    }

    ///////////////////////////////////////////////////////////////
    // Spin trace tensor product
    ///////////////////////////////////////////////////////////////
    for(int co_x=0;co_x<3;co_x++){
      // Spin trace outer product
      /* Read-modify-write one column (fixed co_x) of the 3x3 matrix at F_p_s. */
      for ( int co_y=0;co_y<3;co_y++) complex_load (F[co_y],GIMM[co_y][co_x][0],F_p_s);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[0][co_x],Y[0][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[1][co_x],Y[1][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[2][co_x],Y[2][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[3][co_x],Y[3][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_store(F[co_y],GIMM[co_y][co_x][0],F_p_s);
    }

    /* Next direction's matrix. */
    queue_load_addr(F_p_s,MAT_IMM,F_p_s);
  }

  /* Advance to the next s slice. */
  queue_iadd_imm(X_p,X_p,PSI_IMM);
  queue_iadd_imm(s_offset,s_offset,CHI_IMM);
  stop_loop(branchls,s);

  /* F_p_s has moved on by four matrices; make that the next site's base. */
  queue_iadd_imm(F_p,F_p_s,ZERO_IMM);
  queue_load_addr(tab,TAB_IMM[16],tab);
  stop_loop(branchsite,length);

  make_inst(DIRECTIVE,Target,retno);

  /*
   *
   * EPILOGUE
   *
   */
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);

  return;
}
/*
 * qcdoc_su3_recon: generate the routine called `name`.
 *
 * The generated routine takes four arguments (defargcount(4)), fetched in
 * this order: psi, Umu, Chiin, length.  `length` arrives as a pointer and
 * is dereferenced to give the site count.
 *
 * Emitted structure, per site:
 *   1. SU3 multiply, four passes.  The first pass is the body of an
 *      assembler loop over `mu` (counter loaded with Ndimm1 = 3); the second
 *      pass is emitted once after that loop.  Each pass multiplies the two
 *      spinor at Chiin by the link at Umu and writes six complex results to
 *      a stack buffer (the first three via Chiout, the last three via the
 *      CHIDRAIN macro at the start of the following pass).
 *   2. Reconstruct.  For each colour, two spinors read through Chiminus[0..3]
 *      (stack buffer) and Chiplus[0..3] (Chiin onwards) are combined with
 *      adds/subtracts chosen by mu and by the global `dagger`, giving the
 *      four spin components PSI[co][0..3], which are stored to psi.
 *
 * Several preprocessor macros are defined inside the function body
 * (NEO, LOAD_U, LOAD_CHI, CHIDRAIN, PREFETCH_CHI, LOAD_CHI_MU0, ...); being
 * macros, they remain defined for the rest of the translation unit.
 *
 * The code generator itself returns nothing; all output goes through
 * make_inst() and the queue_* helpers.
 */
void qcdoc_su3_recon( char *name)
{
  /**** This section defines all the registers and offsets I need ****/

  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(4);

  /*Handle for the loop entry point*/
  int branchsite;
  int branchmu;
  int retno ;

  /*------------------------------------------------------------------
   * Floating point registers
   *------------------------------------------------------------------
   */

  // Reconstruct 8 registers for 4 spinor
  // reg_array_2d(PSI,Fregs,4,2);
  reg_array_3d(PSI,Fregs,3,4,2); /* indexed [colour][spin][re/im] */
  offset_3d(PSI_IMM,FourSpinType,4,3,2); /*Offsets within 4 spinor*/

  // Reconstruct 2 spinor registers
#define NEO 2
  /*
   * Only slot [0] of A and B gets dedicated registers; slot [1] starts as -1
   * and is pointed at PSI registers inside the reconstruct loop.
   * In the reconstruct, A is loaded through Chiminus[] and B through Chiplus[].
   */
  reg_array_3d(Atmp,Fregs,1,2,2); /* backs A[0] */
  reg_array_3d(Btmp,Fregs,1,2,2); /* backs B[0] */
  int A[NEO][2][2] = {
    Atmp[0][0][0], Atmp[0][0][1],
    Atmp[0][1][0], Atmp[0][1][1],
    -1,-1,-1,-1
  };
  int B[NEO][2][2] = {
    Btmp[0][0][0], Btmp[0][0][1],
    Btmp[0][1][0], Btmp[0][1][1],
    -1,-1,-1,-1
  };

  /*Regs for SU3 two spinor multiply ... overlap with the reconstruct*/
  /* registers */
  int CHIR[3][2][2] = {
    A[0][0][0],A[0][0][1],
    A[0][1][0],A[0][1][1],
    B[0][0][0],B[0][0][1],
    B[0][1][0],B[0][1][1],
    PSI[0][0][0],PSI[0][0][1],
    PSI[0][1][0],PSI[0][1][1]
  };
  offset_3d(CHI_IMM,TwoSpinType,3,2,2);

  /*Registers for the gauge link (2 rows)*/
  int UA[3][2] = {
    {PSI[0][2][0],PSI[0][2][1]},
    {PSI[2][1][0],PSI[2][1][1]},
    {PSI[1][0][0],PSI[1][0][1]}
  };
  int UB[3][2] = {
    {PSI[1][1][0],PSI[1][1][1]},
    {PSI[2][0][0],PSI[2][0][1]},
    {PSI[1][2][0],PSI[1][2][1]},
  };
  offset_3d(GIMM , GaugeType, 3, 3 ,2 );

  // Other 8 registers used for reduction variables in SU3.
  // Could use these in reconstruct??
  int E[2] = { PSI[2][2][0],PSI[2][2][1]};

  /*
   * FCD used for drain of Chi
   * Overlap with PSI[*][3][*]
   */
  int F[2] = {PSI[0][3][0],PSI[0][3][1]};
  int C[2] = {PSI[1][3][0],PSI[1][3][1]};
  int D[2] = {PSI[2][3][0],PSI[2][3][1]};

  /*
   * Integer registers
   */
  alreg(psi,Iregs);
  alreg(Umu,Iregs);
  alreg(Ufetch,Iregs);

  alreg(Chiin,Iregs);
  alreg(Chiout,Iregs);
  alreg(Chifetch,Iregs);

  reg_array_1d(Chiplus,Iregs,4);/*Pointers to the 8 2-spinors for recombination*/
  reg_array_1d(Chiminus,Iregs,4);

  alreg(mu,Iregs);
  alreg(Chidrain,Iregs);
  alreg(pref,Iregs);

  alreg(mem,Iregs);
  alreg(length,Iregs);

  int Isize = PROC->I_size;
  int Fsize = PROC->FP_size;

  def_off( ZERO_IMM, Byte,0);
  def_off( PSI_ATOM, FourSpinType, 24);     /* psi stride per site */
  def_off( CHI_ATOM, TwoSpinType, 12);      /* Chiout stride per SU3 pass */
  def_off( PAD_CHI_ATOM, TwoSpinType, 16);  /* Chiin stride per SU3 pass */
  def_off( MAT_IMM, GaugeType, 18);         /* Umu stride per SU3 pass */

  int Ndim = def_offset(4,Byte,"Ndim");
  int Ndimm1 = def_offset(3,Byte,"Ndimm1");

  int hbias,bias;

  /*Offsets handles to stack*/
  int hbitbucket = def_offset(16*Isize,Byte,"hbitbucket");
  int Tsize;
  if ( TwoSpinType == Double ) Tsize = PROC->FP_size;
  else Tsize = PROC->FSP_size;
  /* Four consecutive 12*Tsize slots following the bit bucket. */
  int hstk0 = def_offset(16*Isize+12*Tsize ,Byte,"hstk0");
  int hstk1 = def_offset(16*Isize+2*12*Tsize,Byte,"hstk1");
  int hstk2 = def_offset(16*Isize+3*12*Tsize,Byte,"hstk2");
  int hstk3 = def_offset(16*Isize+4*12*Tsize,Byte,"hstk3");

  int hIsize = def_offset(Isize,Byte,"Isize");

  int i,co,j,k,nxt,ri,sp,nxtco,eop,eo_a,eo_b;

  /***********************************************************************/

  /*
   * PROLOGUE
   */

  make_inst(DIRECTIVE,Enter_Routine,name);

  /*Allocate stack save any callee save registers we need etc...*/
  int stack_buf_size;
  stack_buf_size = 16*Isize +
                   12*Fsize * 5 ;

  hbias = grab_stack(stack_buf_size);
  bias = get_offset(hbias);
  save_regs();
  queue_iadd_imm(mem,PROC->StackPointer,hbias); /*Pointer to buf on stack*/

  /*Define our arguments - all pointers ala fortran*/
  getarg(psi);
  getarg(Umu);
  getarg(Chiin);
  getarg(length);

  /*{... Process arguments ...*/

  queue_iload(length,ZERO_IMM,length); /*Load in sx counter*/

  retno = get_target_label(); /*Branch to exit if yzt <1*/
  check_iterations(length,retno);

  need_cache_line(0);
  need_cache_line(1);
  need_cache_line(2);
  need_cache_line(3);
  need_cache_line(4);

  pragma(DCBT_SPACE,5);
  pragma(DCBT_POST,1);

  /* LOAD_U(comin,comax): for i in [comin,comax] load UA[i] from GIMM[i][0] and UB[i] from GIMM[i][1], both relative to Umu. */
#define LOAD_U(comin,comax)\
  /*Load two link rows*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){ \
      queue_fload(UA[i][ri],GIMM[i][0][ri],Umu,GaugeType);\
      queue_fload(UB[i][ri],GIMM[i][1][ri],Umu,GaugeType);\
    } \
  }

#define PRELOAD_U LOAD_U(0,1)
#define POSTLOAD_U LOAD_U(2,2)

  PRELOAD_U

  /* LOAD_CHI(comin,comax): for i in [comin,comax] load both spin components of CHIR[i] relative to Chiin. */
#define LOAD_CHI(comin,comax) \
  /*Load Chi column*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){\
      queue_fload(CHIR[i][0][ri],CHI_IMM[i][0][ri],Chiin,TwoSpinType);\
    } \
    for( ri=0;ri<2;ri++){\
      queue_fload(CHIR[i][1][ri],CHI_IMM[i][1][ri],Chiin,TwoSpinType);\
    } \
  }

#define PRELOAD_CHI LOAD_CHI(0,1)
#define POSTLOAD_CHI LOAD_CHI(2,2)

#define POSTLOAD \
  POSTLOAD_CHI \
  POSTLOAD_U

  do_prefetch(Chiin,0);
  do_prefetch(Chiin,1);
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chiin,2);

  PRELOAD_CHI

  /*
   * Start site loop
   */

  /* The first CHIDRAIN of a site has nothing useful to store, so it is aimed at the bit bucket. */
  queue_iadd_imm(Chidrain,mem,hbitbucket);

  branchsite = start_loop(length);

  queue_iadd_imm(Chiout,mem,hstk0);

  /*
   * Loop over mu in asm
   */
  queue_iload_imm(mu,Ndimm1);

  /* CHIDRAIN: store F, C, D (the second three results of the previous SU3 pass) through Chidrain. */
#define CHIDRAIN \
  queue_fstore(F[0],CHI_IMM[1][1][0],Chidrain,TwoSpinType);\
  queue_fstore(F[1],CHI_IMM[1][1][1],Chidrain,TwoSpinType);\
  queue_fstore(C[0],CHI_IMM[2][0][0],Chidrain,TwoSpinType);\
  queue_fstore(C[1],CHI_IMM[2][0][1],Chidrain,TwoSpinType);\
  queue_fstore(D[0],CHI_IMM[2][1][0],Chidrain,TwoSpinType);\
  queue_fstore(D[1],CHI_IMM[2][1][1],Chidrain,TwoSpinType);

  /* PREFETCH_CHI: point Chifetch one padded two spinor beyond Chiin and prefetch it. */
#define PREFETCH_CHI \
  queue_iadd_imm(Chifetch,Chiin,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

  /* PREFETCH_CHIF: advance Chifetch by a further padded two spinor and prefetch it. */
#define PREFETCH_CHIF \
  queue_iadd_imm(Chifetch,Chifetch,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

  /*
   * The SU3 multiply is emitted twice: unroll==0 is the body of the
   * assembler loop over mu, unroll==1 is a single trailing copy emitted
   * after that loop closes.
   */
  for ( int unroll=0; unroll<2; unroll++ ) {

    if ( unroll==0 ) {
      branchmu = start_loop(mu);
      pragma(DCBT_SPACE,5);
      pragma(STORE_LIM,1);
      pragma(LOAD_LIM,2);
    } else {
      pragma(STORE_LIM,2);
      pragma(DCBT_SPACE,5);
      pragma(DCBT_POST,1);
      pragma(DCBT_PRE,0);
      pragma(LOAD_LIM,2);
    }

    CHIDRAIN
    POSTLOAD

    if ( unroll == 0 ) {
      PREFETCH_CHI
      queue_iadd_imm(Ufetch,Umu,MAT_IMM);
      do_prefetch(Ufetch,0);
      do_prefetch(Ufetch,1);
      do_prefetch(Ufetch,2);
      if ( GaugeType == Double ) {
        do_prefetch(Ufetch,3);
        do_prefetch(Ufetch,4);
      }
    } else {
      /* Trailing pass: prefetch the next four padded two spinors instead of the next link. */
      pragma(DCBT_SPACE,3);
      PREFETCH_CHI
      PREFETCH_CHIF
      PREFETCH_CHIF
      PREFETCH_CHIF
    }

    /* First set of three results: C and D use the UA registers, E uses UB. */
    j=0;
    queue_three_cmuls(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                      D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                      E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
    j=1;
    queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
    j=2;
    queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);

    /*Store the first three results*/
    queue_fstore(C[0],CHI_IMM[0][0][0],Chiout,TwoSpinType);
    queue_fstore(C[1],CHI_IMM[0][0][1],Chiout,TwoSpinType);
    queue_fstore(D[0],CHI_IMM[0][1][0],Chiout,TwoSpinType);
    queue_fstore(D[1],CHI_IMM[0][1][1],Chiout,TwoSpinType);
    queue_fstore(E[0],CHI_IMM[1][0][0],Chiout,TwoSpinType);
    queue_fstore(E[1],CHI_IMM[1][0][1],Chiout,TwoSpinType);

    /*Load the third row*/
    /* GIMM[j][2] is loaded over the UA registers, which the first set no longer needs. */
    for(j=0; j<3; j++) {
      for(ri=0; ri<2; ri++) {
        queue_fload(UA[j][ri],GIMM[j][2][ri],Umu,GaugeType);
      }
    }

    /*Gauge layout is linear, mu faster than site*/
    queue_iadd_imm(Umu,Umu,MAT_IMM);

    /*Now the second set of three cdots*/
    j=0;
    queue_three_cmuls(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                      C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                      D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    j=1;
    queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    j=2;
    queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);

    /**************END SU3 CODE *************/

    queue_iadd_imm(Chiin,Chiin,PAD_CHI_ATOM);
    /* The next CHIDRAIN writes F, C, D into the slot just filled through Chiout. */
    queue_iadd_imm(Chidrain,Chiout,ZERO_IMM);
    queue_iadd_imm(Chiout,Chiout,CHI_ATOM);

    if ( unroll == 0 ) {
      PRELOAD_U
      PRELOAD_CHI
    }

    /*********************************************************/
    /****************** END OF SU3 MULTIPLY ******************/
    /*********************************************************/

    if ( unroll== 0 ) {
      stop_loop(branchmu,mu); /* End loop over mu*/
      make_inst(DIRECTIVE,Target,get_target_label() ); /*delineate the sections*/
    }
  }

  /*********************************************************/
  /****************** START OF RECONSTRUCT *****************/
  /*********************************************************/

  //Address calculation...
  // Chiminus -> Stack and ChiPlus -> Chiin

  pragma(STORE_INORDER,1);
  queue_iadd_imm(Chiminus[0],mem,hstk0);

  /*For register use reasons loop over colour outermost*/
  /*
   * LOAD_CHI_MU0(eo,co): load colour `co` of the mu=0 pair into A[eo] (via
   * Chiminus[0]) and B[eo].  For co == 0, B is read via Chiin and Chiplus[0]
   * is set to Chiin; for other colours B is read via Chiplus[0].
   */
#define LOAD_CHI_MU0(eo,co) \
  for( sp = 0; sp<2;sp++ ){\
    for( ri = 0; ri<2;ri++ ){\
      queue_fload(A[eo][sp][ri],CHI_IMM[co][sp][ri],Chiminus[0],TwoSpinType);\
      if ( co == 0 ) {\
        queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiin,TwoSpinType);\
        queue_iadd_imm(Chiplus[0],Chiin,ZERO_IMM);\
      } else {\
        queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiplus [0],TwoSpinType);\
      }\
    }}

  pragma(LOAD_LIM,2);
  LOAD_CHI_MU0(0,0)
  pragma(DCBT_POST,1);
  /* Flush the last SU3 pass's F, C, D to the stack buffer. */
  CHIDRAIN

  /* eo_a / eo_b select which slot of A / B is current; they cycle modulo neo_a / neo_b. */
  int neo_a = NEO;
  int neo_b = NEO;
  eo_a = 0;
  eo_b = 0;
  for ( co = 0; co <3 ; co ++ ) {

    pragma(LOAD_LIM,1);
    if ( co == 0 ) {
      // Use the third colour for unrolling the loads
      A[1][0][0] = PSI[2][0][0];
      A[1][0][1] = PSI[2][0][1];
      A[1][1][0] = PSI[2][1][0];
      A[1][1][1] = PSI[2][1][1];
      B[1][0][0] = PSI[2][2][0];
      B[1][0][1] = PSI[2][2][1];
      B[1][1][0] = PSI[2][3][0];
      B[1][1][1] = PSI[2][3][1];
      queue_iadd_imm(Chiminus[1],mem,hstk1); // This is invariant of loop
                                             // Take out
      queue_iadd_imm(Chiplus[1],Chiin ,PAD_CHI_ATOM);
    }

    /***************************************************************
    * MU = 0 reconstruct                                           *
    ****************************************************************/

    if ( co == 2 ) {
      // Flip to not unrolled due to register pressure
      /*
       * B now uses a single slot; A keeps two, with A[1] moved onto
       * PSI[0][0][*] and PSI[1][0][*].  Those spin-0 results for colours 0
       * and 1 were already stored to psi at the end of their iterations.
       */
      neo_b = 1;
      neo_a = 2;
      A[1][0][0] = PSI[0][0][0];
      A[1][0][1] = PSI[0][0][1];
      A[1][1][0] = PSI[1][0][0];
      A[1][1][1] = PSI[1][0][1];

      pragma(DCBT_POST,0);
      pragma(DCBT_SPACE,1);
      queue_iadd_imm(Ufetch,Umu,ZERO_IMM);
      // do_prefetch(Ufetch,0);
      do_prefetch(Ufetch,1);
      do_prefetch(Ufetch,2);
      if ( GaugeType == Double ) {
        do_prefetch(Ufetch,3);
        do_prefetch(Ufetch,4);
      }
    }

    /* psi_0 = Chiplus[0] + Chiminus[0] */
    /* psi_1 = Chiplus[1] + Chiminus[1] */

    queue_fadd(PSI[co][0][0],B[eo_b][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],B[eo_b][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],B[eo_b][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],B[eo_b][1][1],A[eo_a][1][1]);

    // Dagger = 0:
    /* psi_2 =-iChiplus[1] +iChiminus[1] */
    /* psi_3 =-iChiplus[0] +iChiminus[0] */
    // Dagger = 1:
    /* psi_2 = iChiplus[1] -iChiminus[1] */
    /* psi_3 = iChiplus[0] -iChiminus[0] */
    if ( dagger == 0 ) {
      queue_fsub(PSI[co][2][0],B[eo_b][1][1],A[eo_a][1][1]);
      queue_fsub(PSI[co][2][1],A[eo_a][1][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][0],B[eo_b][0][1],A[eo_a][0][1]);
      queue_fsub(PSI[co][3][1],A[eo_a][0][0],B[eo_b][0][0]);
    } else {
      queue_fsub(PSI[co][2][0],A[eo_a][1][1],B[eo_b][1][1]);
      queue_fsub(PSI[co][2][1],B[eo_b][1][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][3][0],A[eo_a][0][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][1],B[eo_b][0][0],A[eo_a][0][0]);
    }

    /***************************************************************
    * MU = 1 reconstruct                                           *
    ****************************************************************/

    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[1],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [1],TwoSpinType);
      }
    }

    /* Remaining pointers are set up once, during the first colour. */
    if ( co == 0 ) {
      queue_iadd_imm(Chiminus[2],mem,hstk2);
      queue_iadd_imm(Chiminus[3],mem,hstk3);
      queue_iadd_imm(Chiplus[2],Chiplus[1],PAD_CHI_ATOM);
      queue_iadd_imm(Chiplus[3],Chiplus[2],PAD_CHI_ATOM);
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */

    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    //Dagger == 0
    /* psi_2 += Chiplus[1] - Chiminus[1] */
    /* psi_3 += -Chiplus[0] + Chiminus[0] */
    //Dagger == 1
    /* psi_2 -= Chiplus[1] - Chiminus[1] */
    /* psi_3 -= -Chiplus[0] + Chiminus[0] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);

      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);

      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
    }

    /***************************************************************
    * MU = 2 reconstruct                                           *
    ****************************************************************/

    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[2],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [2],TwoSpinType);
      }
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */

    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    //Dagger == 0
    /* psi_2 +=-iChiplus[0] +iChiminus[0] */
    /* psi_3 += iChiplus[1] -iChiminus[1] */
    //Dagger == 1
    /* psi_2 -=-iChiplus[0] +iChiminus[0] */
    /* psi_3 -= iChiplus[1] -iChiminus[1] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);

      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);

      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
    }

    /***************************************************************
    * MU = 3 reconstruct                                           *
    ****************************************************************/

    pragma(LOAD_LIM,2);

    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[3],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [3],TwoSpinType );
      }
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */

    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

    //Dagger == 0
    /* psi_2 += Chiplus[0] - Chiminus[0] */
    /* psi_3 += Chiplus[1] - Chiminus[1] */
    //Dagger == 1
    /* psi_2 -= Chiplus[0] - Chiminus[0] */
    /* psi_3 -= Chiplus[1] - Chiminus[1] */
    /* For this direction all B (Chiplus) terms are applied first, then all A (Chiminus) terms. */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
    }

    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    if ( dagger == 0 ) {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
    } else {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
    }

    /*
     * Store the spinors. If this is problematic
     * in terms of PEC WriteBuf misses, I could
     * store to the stack and copy out later.
     */

    /* Start loading the next colour's mu=0 pair into slot 0 and reset the slot selectors. */
    if ( co != 2 ) {
      LOAD_CHI_MU0(0,co+1)
      eo_a=0;
      eo_b=0;
    }

    /* Only spin component 0 is stored inside the colour loop. */
    queue_fstore(PSI[co][0][0],PSI_IMM[0][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][0][1],PSI_IMM[0][co][1],psi,FourSpinType);

  }

  /*
   * Store out in linear order now
   */
  pragma(STORE_LIM,2);
  pragma(DCBT_SPACE,8);

  for ( co=0; co<3; co ++ ) {
    queue_fstore(PSI[co][1][0],PSI_IMM[1][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][1][1],PSI_IMM[1][co][1],psi,FourSpinType);
  }
  for ( co=0; co<3; co ++ ) {
    queue_fstore(PSI[co][2][0],PSI_IMM[2][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][2][1],PSI_IMM[2][co][1],psi,FourSpinType);
  }

  /*
   * Spin component 3 lives in the F, C, D registers (PSI[*][3][*]).
   * When the two types match, no explicit store is emitted here: Chidrain
   * is aimed at psi + CHI_ATOM so that the next CHIDRAIN writes those
   * registers out.  Otherwise they are stored explicitly and Chidrain is
   * sent back to the bit bucket.
   */
  if ( TwoSpinType == FourSpinType ) {
    queue_iadd_imm(Chidrain,psi,CHI_ATOM);
  } else {
    queue_iadd_imm(Chidrain,mem,hbitbucket);
    for ( co=0; co<3; co ++ ) {
      queue_fstore(PSI[co][3][0],PSI_IMM[3][co][0],psi,FourSpinType);
      queue_fstore(PSI[co][3][1],PSI_IMM[3][co][1],psi,FourSpinType);
    }
  }

  queue_iadd_imm(psi,psi,PSI_ATOM);

  /*
   * Put in an artificial dependency here
   * to try to stop the preloads getting above the last load of
   * reconstruct.
   */
  queue_iadd_imm(Chiplus[3],Chiplus[3],ZERO_IMM);
  queue_iadd_imm(Chiin ,Chiplus[3],PAD_CHI_ATOM);
  pragma(DCBT_SPACE,0);
  do_prefetch(Chiin,0);
  do_prefetch(Chiin,1);
  if ( SizeofDatum(TwoSpinType) == 8 )do_prefetch(Chiin,2);

  /* Preload the first link rows and Chi columns for the next site's SU3 pass. */
  PRELOAD_U
  PRELOAD_CHI

  /* TERMINATION point of the loop*/
  stop_loop(branchsite,length);

  /* Final drain of F, C, D after the last site. */
  CHIDRAIN

  make_inst(DIRECTIVE,Target,retno);

  /*
   *
   * EPILOGUE
   *
   */

  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);

  return;
}
static PSW systeme_init(void) { DEBUG_PUTS(DEBUG_T_SYSTEM, "Booting"); process_init(); //Création du processus père process_alloc()->state = PROC_STATE_READY; process_next(); PSW cpu; const int varIndex = 15; /*** creation d'un programme ***/ make_inst(0, INST_SUB, 1, 1, 0); //R1 = 0 make_inst(1, INST_SUB, 2, 2, 0); //R2 = 0 make_inst(2, INST_STORE, 1, 1, varIndex); //mem[15] = 0 make_inst(3, INST_SYSC, 2, 2, SYSC_FORK); //R2 = fork() make_inst(4, INST_IFGT, 0, 0, 10); //if(AC == 0) faire fils, sinon le père //Code fils make_inst(5, INST_ADD, 1, 0, 1); //R1 = 1 make_inst(6, INST_SYSC, 1, 0, SYSC_PUTI); //puti(1) make_inst(7, INST_SYSC, 1, 0, SYSC_SLEEP);//sleep(1) make_inst(8, INST_SYSC, 1, 0, SYSC_PUTI);//puti(1) make_inst(9, INST_SYSC, 0, 0, SYSC_EXIT);//exit() //Code Père make_inst(10, INST_ADD, 1, 0, 1); //R1 = 4 make_inst(11, INST_SYSC, 1, 0, SYSC_SLEEP); //sleep(4) make_inst(12, INST_JUMP, 0, 0, 3); //refait le fork make_inst(13, INST_SYSC, 2, 0, SYSC_PUTI); //puti(R2) make_inst(14, INST_JUMP, 0, 0, 11); /*** valeur initiale du PSW ***/ memset(&cpu, 0, sizeof (PSW)); cpu.IN = INT_NONE; cpu.PC = 0; cpu.SB = 0; cpu.SS = 20; return cpu; }
/*
 * Queue an unconditional branch.
 *
 *   where - label handle to branch to
 */
void jump(int where)
{
  const int target_label = where;

  make_inst(BRCHPIPE, BRANCH, target_label);
}
void queue_cmovge_knc(int cond, int src, int dst) { make_inst(IALUPIPE,IOR_PREDICATE,cond,cond); make_inst(IALUPIPE,CMOVLT,dst,src); }
/* caller should now resolve word_id against word_form_index before calling and pass the result as form arg if non-NULL; NULL arg means form is embedded in lemma */ void ilem_parse(struct xcl_context *xc, struct ilem_form *master_formp) { unsigned char *lem; int newflag = 0; extern const char *phase; unsigned char *lemma = NULL; #define LANGBUF_LEN 32 char langbuf[LANGBUF_LEN+1]; #if 0 #define FORMBUF_LEN 128 char formbuf[FORMBUF_LEN+1]; #endif struct xcl_l *master_lp = NULL; if (!xc) { vwarning("internal error: ilem_parse called with NULL args"); return; } if (!master_formp) { /* this can happen after ATF parse errors */ return; } phase = "lem"; /*#define lemma (master_formp->literal)*/ if (master_formp->literal) { lemma = npool_copy((unsigned char *)master_formp->literal, xc->pool); } else { struct xcl_l*lp = xcl_lemma(xc,NULL,master_formp->ref,NULL,NULL,0); lp->lnum = master_formp->lnum; lp->f = master_formp; lp->inst = make_inst(xc,lp->f); phase = NULL; return; } if (NULL == master_formp->f2.lang) { if ('%' == *lemma) { char *langbufp = langbuf; for (++lemma; *lemma != ':' && *lemma != '-'; ) { if (langbufp - langbuf == LANGBUF_LEN) { langbuf[LANGBUF_LEN] = '\0'; vwarning2(file,lnum,"[91]: lang starting with '%s' is too long (MAX %d)",langbuf,LANGBUF_LEN); phase = NULL; return; } else *langbufp++ = *lemma++; } if ('-' == *lemma) { while (*lemma && ':' != *lemma) ++lemma; if (!*lemma) { vwarning2(file,lnum,"[92]: lang starting with '%s' has no ':'",langbuf); phase = NULL; return; } } } else { vwarning2(file,lnum,"[96]: no lang set for form"); phase = NULL; return; } master_formp->f2.lang = npool_copy((unsigned char *)langbuf,xc->pool); master_formp->f2.core = langcore_of(langbuf); } else if ('%' == *lemma && '%' != lemma[1]) { while (*lemma && ':' != *lemma) ++lemma; if (':' != *lemma) { vwarning2(file,lnum,"lang has no ':'"); return; } ++lemma; } #if 0 /* In L1 this routine had to handle lems with a form prepended and separated by * (not = , because that 
conflicts with = in ASCII macron). This is no longer the case in L2 */ if (NULL == master_formp->f2.form) { char *formbufp = formbuf; while (*lemma != '*') { if (formbufp - formbuf == FORMBUF_LEN) { formbuf[10] = '\0'; vwarning2(file,lnum,"[94]: form starting '%s' is too long (MAX %d)",formbuf,FORMBUF_LEN); phase = NULL; return; } *formbufp++ = *lemma++; } if ('*' != *lemma) { formbuf[10] = '\0'; vwarning2(file,lnum,"[95]: form starting '%s' has no '*'",formbuf,FORMBUF_LEN); phase = NULL; return; } ++lemma; } #endif /* Now we know that lemma points to the start of the lemmatization */ lem_init((const unsigned char *)lemma); /* This outer loop splits on '&' */ while (1) { struct xcl_l*lp; int alt_count = 0; int iflags = 0; struct ilem_form *curr_f = NULL; lem = lem_next(xc); if (!lem) break; lp = xcl_lemma(xc,NULL,master_formp->ref,NULL,NULL,0); lp->inst = master_formp->literal; lp->lnum = lnum; lp->ante_para = ilem_para_parse(xc, lem,&lem,master_formp->lnum, ilem_para_pos_ante); if (lem) { unsigned char *post = NULL; while (isspace(*lem)) ++lem; post = lem_end(lem); lp->post_para = ilem_para_parse(xc, post,NULL,master_formp->lnum, ilem_para_pos_post); if (isspace(*post)) { while (post > lem && isspace(post[-1])) --post; *post = '\0'; } ilem_para_boundaries(lp,xc); } else { vwarning2(file,master_formp->lnum,"[96]: lem `%s' failed syntax stripping",lem); break; } alt_init(lem); if (master_formp->mcount) { struct ilem_form *mrover = NULL; /*lp->f = NULL;*/ /* NEW ILEM_FORM form_allocator();*/ lp->f = mb_new(xc->sigs->mb_ilem_forms); lp->f->newflag = newflag; lp->f->f2.lang = master_formp->f2.lang; lp->f->f2.core = master_formp->f2.core; lp->f->mcount = -1; if (master_formp->mcount == 1) { master_formp->type = "cof-head"; master_lp->cof_tails = list_create(LIST_SINGLE); } lp->f->type = "cof-tail"; lp->cof_head = master_lp; list_add(lp->cof_head->cof_tails, lp); ++master_formp->mcount; /* efficiency doesn't matter here as we will have relatively few of these */ for 
(mrover = master_formp; mrover->multi; mrover = mrover->multi) ; mrover->multi = lp->f; /*lp->f->master = master_formp;*/ lp->f->file = master_formp->file; lp->f->lnum = master_formp->lnum; lp->ref = lp->f->ref = master_formp->ref; lp->f->f2.form = master_formp->f2.form; lp->f->literal = NULL; } else { lp->f = master_formp; lp->f->mcount = 1; lp->f->newflag = newflag; lp->ref = lp->f->ref; lp->f->type = NULL; master_lp = lp; } lp->f->instance_flags = iflags; /* This inner loop splits on '|'; it is where each lemma is actually handled */ while (1) { lem = alt_next(xc); if (!lem) break; iflags = 0; while (lem_iflags[*lem]) { switch (*lem) { case '+': ++lem; /*newflag = !ignore_plus; */ BIT_SET(iflags, F2_FLAGS_LEM_NEW); break; case '!': ++lem; BIT_SET(iflags, F2_FLAGS_PSU_STOP); break; case '-': ++lem; BIT_SET(iflags, F2_FLAGS_PSU_SKIP); break; case '`': lem = (unsigned char *)"X"; break; } } if (bootstrap_mode && !BIT_ISSET(iflags, F2_FLAGS_LEM_NEW)) BIT_SET(iflags, F2_FLAGS_LEM_NEW); if (BIT_ISSET(iflags,F2_FLAGS_LEM_NEW)) { char *tmp = malloc(strlen(lem) + 2); sprintf(tmp, "+%s", lem); lem = npool_copy(tmp, xc->pool); free(tmp); } if (alt_count++) { struct ilem_form *last_alt = NULL, *f = NULL; if (!lem) break; /*f->f2 = NULL form_allocator();*/ f = mb_new(xc->sigs->mb_ilem_forms); /* f->newflag = newflag; */ lp->f->ref = master_formp->ref; f->f2.lang = master_formp->f2.lang; f->f2.core = master_formp->f2.core; f->f2.form = master_formp->f2.form; if (BIT_ISSET(iflags, F2_FLAGS_LEM_NEW)) { BIT_SET(f->f2.flags, F2_FLAGS_LEM_NEW); if ('+' == *lem) /* should always be true */ ++lem; } f->lnum = master_formp->lnum; f->file = master_formp->file; f->instance_flags = iflags; f->sublem = (char*)npool_copy(lem,xc->pool); /* link this into the master_formp */ for (last_alt = master_formp; last_alt->ambig; last_alt = last_alt->ambig) ; curr_f = last_alt->ambig = f; } else { lp->f->sublem = (char*)npool_copy(lem,xc->pool); curr_f = lp->f; if (BIT_ISSET(iflags, 
F2_FLAGS_LEM_NEW)) { BIT_SET(curr_f->f2.flags, F2_FLAGS_LEM_NEW); if ('+' == *lem) /* should always be true */ ++lem; } } /* Instance parsing cannot result in a form with && being processed using f2_parse_cof, so we can just pass a NULL final argument */ f2_parse((Uchar*)lp->f->file, lp->f->lnum, lem, &curr_f->f2, (Uchar**)&curr_f->psu_sense, NULL); if (check_cf((char*)lp->f->file, lp->f->lnum, (char*)curr_f->f2.cf)) BIT_SET(curr_f->f2.flags, F2_FLAGS_INVALID); if (curr_f->lang) { curr_lang = curr_f->lang; if (!BIT_ISSET(curr_f->f2.flags,F2_FLAGS_CF_QUOTED)) curr_f->f2.cf = ilem_conv(lp,curr_f->f2.cf); curr_f->f2.norm = ilem_conv(lp,curr_f->f2.norm); curr_f->f2.base = ilem_conv(lp,curr_f->f2.base); curr_f->f2.cont = ilem_conv(lp,curr_f->f2.cont); } curr_f->sublem = make_inst(xc,curr_f); } } }
/*
 * Convert a text instruction listing into a binary file.
 *
 * Usage: prog <input-text> <output-binary>
 *
 * The first input line holds an instruction count; it is read and discarded
 * because the output is produced line by line.  Every following line must
 * hold five unsigned integers: opcode, value, ra, rb, rc.  Each is encoded
 * with make_inst() and written as four bytes, most significant byte first.
 *
 * Returns 0 on success; exits with status 1 on a usage or I/O error.
 */
int main(int argc, char* argv[])
{
    if (argc != 3) {
        fprintf(stderr, "wrong number of files\n");
        exit(1);
    }

    FILE *fp = fopen(argv[1], "r");
    FILE *write = fopen(argv[2], "wb");
    if (fp == NULL) {
        fprintf(stderr, "can't open read file\n");
        exit(1);
    }
    if (write == NULL) {
        fprintf(stderr, "can't open write file\n");
        exit(1);
    }

    char line[100];

    /* Consume the header line; the count it carries is not needed. */
    if (fgets(line, sizeof line, fp) == NULL) {
        line[0] = '\0';
    }

    /* Always pass the real buffer size to fgets: a larger limit lets it
     * write past the end of line[]. */
    while (fgets(line, sizeof line, fp) != NULL) {
        /* Scan into plain unsigned so the %u conversions match their
         * arguments exactly. */
        unsigned opcode, value, ra, rb, rc;
        if (sscanf(line, "%u %u %u %u %u",
                   &opcode, &value, &ra, &rb, &rc) != 5) {
            /* Do not encode stale or uninitialised fields. */
            fprintf(stderr, "skipping malformed line: %s", line);
            continue;
        }

        uint32_t word = make_inst(opcode, value, ra, rb, rc);

        /* Emit the word big-endian: byte 3 (bits 31..24) first. */
        for (int j = 3; j >= 0; j--) {
            putc((unsigned char)Bitpack_getu(word, 8, j*8), write);
        }
    }

    fclose(fp);
    /* fclose flushes buffered output, so check both it and the stream's
     * error flag before claiming success. */
    int failed = ferror(write);
    if (fclose(write) != 0 || failed) {
        fprintf(stderr, "error writing output file\n");
        exit(1);
    }
    return 0;
}
/*
 * Queue the loop-entry guard for the usparcIIs target: skip to retno when
 * the iteration count is not positive.
 *
 * cntreg: register holding the iteration count
 * retno:  branch target handle to jump to when cntreg <= 0
 */
void check_iterations_usparcIIs(int cntreg,int retno)
{
  /* OR the counter with itself: value unchanged, predicate set from it */
  make_inst(IALUPIPE,IOR_PREDICATE,cntreg,cntreg,cntreg);

  /* Branch if cntreg <= 0 */
  make_inst(BRCHPIPE,BRANCH_LE,retno);
  return;
}
/*
 * Generate the routine `name`, which merges two input vectors into one
 * output vector.
 *
 * The generated routine takes four arguments in this order (see the
 * getarg() calls): output pointer, first input pointer, second input
 * pointer, iteration count.  Per iteration it loads three complex values
 * from each input, combines them with complex_simd_merge() into six
 * output registers, stores those six, and advances all three streams.
 * When half_precision is set, the *_half load/store helpers are used with
 * a stack scratch area and a mask register.
 *
 * name: label used for the Enter_Routine / Exit_Routine directives.
 */
void qcdoc_merge( char *name)
{
  /* NOTE(review): 5 argument slots are declared here but only 4 getarg()
     calls follow -- confirm this is intended. */
  int dum = defargcount(5);

  /*Integer register usage*/
  alreg(outptr,Iregs);
  alreg(vec1ptr,Iregs);
  alreg(vec2ptr,Iregs);
  alreg(counter,Iregs);

  /*Floating register usage*/
  reg_array_1d(vec1,Cregs,3);
  reg_array_1d(vec2,Cregs,3);
  reg_array_1d(oreg,Cregs,6);
  alreg(permreg,Cregs);

  /* Immediate offsets used below */
  def_off(ZERO,SpinorType,0);
  def_off (IN_ATOM,SpinorType,6*nsimd()); // 2spins worth, 3 colors x complex
  def_off (OUT_ATOM,SpinorType,12*nsimd());// 2spins worth, 3 colors x complex x simd
  def_off(bits16,Byte,0xFFFF);
  def_off(thirtytwo,Byte,32);
  def_off(sixteen,Byte,16);
  offset_2d(CHI_IMM,SpinorType,6,2*nsimd());

  int Isize = PROC->I_size;
  int word_size = def_offset(Isize,Byte,"word_size"); /* not referenced again in this routine */
  struct stream *PreOut;
  struct stream *PreVec1;
  struct stream *PreVec2;

  int brchno,retno; /*Branch target handles*/
  int co;

  /* Prologue: enter routine, reserve 64 units of stack, save registers */
  make_inst(DIRECTIVE,Enter_Routine,name);
  int bias = grab_stack(64);
  save_regs();
  /* Offset the stack pointer by bias; undone by the queue_isub_imm
     at the end of the routine */
  queue_iadd_imm(PROC->StackPointer,PROC->StackPointer,bias);

  getarg(outptr); /*Get args*/
  getarg(vec1ptr);
  getarg(vec2ptr);
  getarg(counter);

  /* Integer registers passed to the half-precision load/store helpers */
  alreg(Mask,Iregs);
  alreg(Convert1,Iregs);
  alreg(Convert2,Iregs);
  /* The stack pointer register is handed to the half-precision helpers
     as their `memory` argument */
  int memory = PROC->StackPointer;

  for (int i =0; i<6; i++ ) {
    need_constant(i*2*SizeofDatum(SpinorType)*nsimd());
  }
  need_constant(64);

  complex_simd_init(permreg);

  if ( half_precision ) {
    /* Build Mask in steps: load ZERO, OR in bits16, shift by thirtytwo,
       OR in bits16 again, shift by sixteen */
    queue_iload_imm(Mask,ZERO);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,thirtytwo);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,sixteen);
  }

  /*
   * Insert a label to prevent reordering
   */
  make_inst(DIRECTIVE,Target,get_target_label());

  /* One linear stream per pointer: two inputs, one output */
  PreVec1= create_stream(IN_ATOM,vec1ptr ,counter,STREAM_IN ,LINEAR);
  PreVec2= create_stream(IN_ATOM,vec2ptr ,counter,STREAM_IN ,LINEAR);
  PreOut = create_stream(OUT_ATOM,outptr ,counter,STREAM_OUT ,LINEAR);

  /*Branch to stack restore if length <1*/
  retno = get_target_label();
  check_iterations(counter,retno);

  /*
   * Start software pipeline
   */
  brchno = start_loop(counter);

  int indco[3]={0,1,2};   /* order in which the three entries are loaded */
  int permute_mu=3;       /* second argument to complex_simd_merge below */

  /* Load the three complex entries of each input vector */
  for(int ico=0;ico<3;ico++){
    co = indco[ico];
    // Could do entirely in integer unit for half precision to accelerate this
    if ( half_precision ) {
      complex_load_half(vec1[co],CHI_IMM[co][0],vec1ptr,memory,Convert1,Convert2,Mask);
      complex_load_half(vec2[co],CHI_IMM[co][0],vec2ptr,memory,Convert1,Convert2,Mask);
    } else {
      complex_load(vec1[co],CHI_IMM[co][0],vec1ptr,SpinorType);
      complex_load(vec2[co],CHI_IMM[co][0],vec2ptr,SpinorType);
    }
  }

  {
    // Merge the vectors
    /* Selector 0 fills the even output registers, selector 1 the odd ones */
    for(co=0;co<3;co++) complex_simd_merge (0,permute_mu,oreg[co*2] ,vec1[co],vec2[co]);
    for(co=0;co<3;co++) complex_simd_merge (1,permute_mu,oreg[co*2+1],vec1[co],vec2[co]);
  }

  // make_inst(DIRECTIVE,LS_BARRIER);

  /* Store the six merged registers to the output */
  for(int i=0;i<6;i++){
    // 2 SIMD sites, 3 colors, 2 spins 2 complex == 24 floats
    if ( half_precision ) {
      complex_store_half(oreg[i],CHI_IMM[i][0],outptr,memory,Convert1,Convert2,Mask);
    } else {
      complex_store(oreg[i],CHI_IMM[i][0],outptr,SpinorType);
    }
  }

  /* Advance the input streams, prefetch from both, then advance the output */
  iterate_stream(PreVec1);
  iterate_stream(PreVec2);
  do_prefetch(vec1ptr,0);
  do_prefetch(vec2ptr,0);
  do_prefetch(vec1ptr,1);
  do_prefetch(vec2ptr,1);
  iterate_stream(PreOut);

  stop_loop(brchno,counter);

  /* Landing point for the counter < 1 early exit */
  make_inst(DIRECTIVE,Target,retno);

  /* Epilogue: undo the stack bias, restore registers, release the stack */
  queue_isub_imm(PROC->StackPointer,PROC->StackPointer,bias);
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);

  return;
}
/*
 * Queue the loop-closing sequence for the KNC target: step the counter
 * and branch back while it is still positive.
 *
 * branchno: branch target handle for the top of the loop
 * counter:  register holding the remaining iteration count
 */
void stop_loop_knc(int branchno, int counter)
{
  /* counter += Hdecrement (the "minus1" offset set up by start_loop) */
  queue_load_addr(counter,Hdecrement,counter);

  /* OR the counter with itself to set the predicate reg */
  make_inst(IALUPIPE,IOR_PREDICATE,counter,counter);

  /* Go round again while counter > 0 */
  make_inst(BRCHPIPE,BRANCH_GT,branchno);
  return;
}
/*
 * Queue the loop-entry guard for the KNC target: skip to retno when the
 * iteration count is not positive.
 *
 * cntreg: register holding the iteration count
 * retno:  branch target handle to jump to when cntreg <= 0
 */
void check_iterations_knc(int cntreg, int retno)
{
  /* OR the counter with itself to set the predicate reg */
  make_inst(IALUPIPE, IOR_PREDICATE,cntreg,cntreg);

  /* Branch if cntreg <= 0 */
  make_inst(BRCHPIPE, BRANCH_LE,retno);
  return;
}
/*
 * Queue a conditional move for the usparcIIs target.
 *
 * Emits a single CMOVGE instruction with operands (dst, cond, src), so no
 * separate predicate-setting instruction is queued here.
 *
 * cond: register tested by the instruction
 * src:  register to copy from
 * dst:  register to copy into
 *
 * NOTE(review): presumably dst receives src when cond >= 0, as with the
 * other cmovge helpers -- confirm against the target's instruction table.
 */
void queue_cmovge_usparcIIs(int cond,int src,int dst)
{
  make_inst(IALUPIPE, CMOVGE, dst, cond, src);
  return;
}
/*
 * Generate the routine `name`, which multiplies a six-component complex
 * vector by a Hermitian 6x6 block at every site.
 *
 * The generated routine receives one argument: a pointer to a list of four
 * integer-sized words, read in this order: Chi_p, Psi_p, Clo_p, length.
 * Per site it loads six complex values from Psi_p, loads 21 complex
 * entries from Clo_p (6 first, then 15), accumulates the products into the
 * Chi registers, stores them through Chi_p, and advances all three
 * pointers.
 *
 * name: label used for the Enter_Routine / Exit_Routine directives.
 */
void clov_apply( char *name)
{
  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(1);

  int retno;
  int branchsite;

  /*-------------------------------------------------------------------------------
   * registers used
   *-------------------------------------------------------------------------------
   */
  reg_array_1d(Chi ,Cregs,6); // result accumulators, 6 regs (stored through Chi_p)
  reg_array_1d(Psi ,Cregs,6); // input values, 6 regs (loaded through Psi_p)
  reg_array_1d(Clo ,Cregs,15); // first holds the 6 entries CLOIMM[0..5], then reloaded with CLOIMM[6..20]

  offset_2d(PSIIMM,FourSpinType,6,2*nsimd());
  offset_2d(CLOIMM,CloverType,21,2*nsimd());

  /*
   * Integer registers
   */
  alreg(Clo_p,Iregs); /*Pointer to the 21 CloverType entries of the current site*/
  alreg(Chi_p,Iregs); /*Pointer the result is stored through (complex_store below)*/
  alreg(Psi_p,Iregs); /*Pointer the input is loaded through (complex_load below)*/
  alreg(length,Iregs); /*number of sites*/
  //alreg(tab,Iregs); /*Pointer to current entry in offset table*/
  alreg(args,Iregs);

  /*Useful integer immediate constants, in units of Fsize*/
  def_off(ZERO_IMM,Byte,0);
  def_off(SPINOR, FourSpinType, 12*nsimd()); /* per-site stride for Psi_p and Chi_p */
  def_off(CLOVER, CloverType, 42*nsimd());   /* per-site stride for Clo_p */
  int Isize = def_offset(PROC->I_size,Byte,"Isize");

  /*--------------------------------------------------------------------
   * Start of the "pseudo assembler proper.
   *--------------------------------------------------------------------
   */
  make_inst(DIRECTIVE,Enter_Routine,name);

  grab_stack(0);
  save_regs();

  /*
   * Define our arguments
   */
  getarg(args); /*Pointer to arg list*/

  /* Walk the argument list one Isize step at a time */
  queue_iload(Chi_p, ZERO_IMM,args);
  queue_load_addr(args,Isize,args);
  queue_iload(Psi_p, ZERO_IMM,args);
  queue_load_addr(args,Isize,args);
  queue_iload(Clo_p, ZERO_IMM,args);
  queue_load_addr(args,Isize,args);
  queue_iload(length,ZERO_IMM,args);

  for (int i =0; i<6; i++ ) {
    need_constant(i*2*SizeofDatum(FourSpinType)*nsimd());
  }
  for (int i =0; i<21; i++ ) {
    need_constant(i*2*SizeofDatum(CloverType)*nsimd());
  }

  retno = get_target_label(); /*Branch to exit if length <1*/
  check_iterations(length,retno);

  /*
   * Site loop
   */
  branchsite = start_loop(length);

  /* Load the six input components */
  for(int pp=0; pp<6; pp++) {
    complex_load(Psi[pp],PSIIMM[pp][0],Psi_p,FourSpinType);
  }

  /* First six stored entries are paired with Psi of the same index, i.e.
     the diagonal: Chi[i] = Clo[i] * Psi[i] */
  for(int qq=0; qq<6; qq++) {
    complex_load(Clo[qq],CLOIMM[qq][0],Clo_p,CloverType);
  }
  for(int rr=0; rr<6; rr++) {
    complex_mul (Chi[rr], Clo[rr], Psi[rr]);
  }

  /* Remaining 15 entries overwrite the Clo registers.  From the uses
     below, Clo[k] is the element at row r, column c (r > c) with
     k = r*(r-1)/2 + c. */
  for(int ss=0; ss<15; ss++) {
    complex_load(Clo[ss],CLOIMM[ss+6][0],Clo_p,CloverType);
  }

  /* Off-diagonal accumulation.  Row r uses Clo directly (complex_madd)
     for columns below r and the conjmadd form for columns above r. */
  complex_conjmadd(Chi[0], Clo[0], Psi[1]);
  complex_conjmadd(Chi[0], Clo[1], Psi[2]);
  complex_conjmadd(Chi[0], Clo[3], Psi[3]);
  complex_conjmadd(Chi[0], Clo[6], Psi[4]);
  complex_conjmadd(Chi[0], Clo[10], Psi[5]);

  complex_madd (Chi[1], Clo[0], Psi[0]);
  complex_conjmadd(Chi[1], Clo[2], Psi[2]);
  complex_conjmadd(Chi[1], Clo[4], Psi[3]);
  complex_conjmadd(Chi[1], Clo[7], Psi[4]);
  complex_conjmadd(Chi[1], Clo[11], Psi[5]);

  complex_madd (Chi[2], Clo[1], Psi[0]);
  complex_madd (Chi[2], Clo[2], Psi[1]);
  complex_conjmadd(Chi[2], Clo[5], Psi[3]);
  complex_conjmadd(Chi[2], Clo[8], Psi[4]);
  complex_conjmadd(Chi[2], Clo[12], Psi[5]);

  complex_madd (Chi[3], Clo[3], Psi[0]);
  complex_madd (Chi[3], Clo[4], Psi[1]);
  complex_madd (Chi[3], Clo[5], Psi[2]);
  complex_conjmadd(Chi[3], Clo[9], Psi[4]);
  complex_conjmadd(Chi[3], Clo[13], Psi[5]);

  complex_madd (Chi[4], Clo[6], Psi[0]);
  complex_madd (Chi[4], Clo[7], Psi[1]);
  complex_madd (Chi[4], Clo[8], Psi[2]);
  complex_madd (Chi[4], Clo[9], Psi[3]);
  complex_conjmadd(Chi[4], Clo[14], Psi[5]);

  complex_madd (Chi[5], Clo[10], Psi[0]);
  complex_madd (Chi[5], Clo[11], Psi[1]);
  complex_madd (Chi[5], Clo[12], Psi[2]);
  complex_madd (Chi[5], Clo[13], Psi[3]);
  complex_madd (Chi[5], Clo[14], Psi[4]);

  /* Write the six results */
  for(int sp=0; sp<6; sp++) {
    complex_store(Chi[sp],PSIIMM[sp][0],Chi_p,FourSpinType);
  }

  /* Step every pointer on to the next site */
  queue_iadd_imm(Psi_p,Psi_p,SPINOR);
  queue_iadd_imm(Chi_p,Chi_p,SPINOR);
  queue_iadd_imm(Clo_p,Clo_p,CLOVER);

  /* TERMINATION point of the loop*/
  stop_loop(branchsite,length);

  /* Landing point for the length < 1 early exit */
  make_inst(DIRECTIVE,Target,retno);

  /* EPILOGUE */
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);

  return;
}