/*
 * Record a live heap allocation (address `p`, `size` bytes) in the global
 * AVL tree `mr_tree`, capturing the caller's return-address chain so leaks
 * can be attributed later.  The tree is guarded by `mr_tree_lock`.
 *
 * NOTE(review): assumes `grab_stack(ra, depth, skip)` here is the
 * stack-capture helper of the memory tracker, not the code-generator
 * `grab_stack(int)` seen elsewhere in this project — confirm.
 */
static void add_memory_record(void *p,unsigned long size)
{
  /* Raw malloc so the tracker does not recurse into itself. */
  struct memory_record *mr = (struct memory_record *)malloc(sizeof(struct memory_record));
  if (mr == NULL) {
    /* Out of memory: drop this record rather than dereference NULL.
     * (The original dereferenced `mr` unconditionally — UB on OOM.) */
    return;
  }
  mr->p    = p;
  mr->size = size;
  grab_stack(mr->ra,MR_CALL_STACK,1); /* capture call stack, skipping self */

  PR_Lock( mr_tree_lock );
  avl_insert( &mr_tree, mr, memory_record_compare, memory_record_duplicate_disallow );
  PR_Unlock( mr_tree_lock );
}
/*
 * Generate (do not execute) an assembler routine called `name` using the
 * instruction-queue DSL (queue_xxx / pragma / start_loop / make_inst).
 * For each of `length` sites the emitted code multiplies half spinors by
 * an SU(3) gauge link (two rows loaded, third row streamed in mid-way),
 * then spin-reconstructs four direction contributions (Chiplus/Chiminus
 * pairs) into a full four-spinor written to `psi`.
 *
 * Emitted-routine arguments (Fortran style, all pointers):
 *   psi    - output four-spinor field
 *   Umu    - gauge field, mu index running faster than site
 *   Chiin  - input two-spinor stream
 *   length - site count (loaded through one indirection)
 *
 * The code is software pipelined: results of the previous iteration are
 * "drained" (CHIDRAIN) while the next iteration's operands are preloaded.
 * Statement order is therefore behavior — do not reorder emission calls.
 */
void qcdoc_su3_recon( char *name)
{
  /**** This section defines all the registers and offsets I need ****/

  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(4);

  /*Handle for the loop entry point*/
  int branchsite;
  int branchmu;
  int retno ;

  /*------------------------------------------------------------------
   * Floating point registers
   *------------------------------------------------------------------
   */

  // Reconstruct 8 registers for 4 spinor
  // reg_array_2d(PSI,Fregs,4,2);
  reg_array_3d(PSI,Fregs,3,4,2);
  offset_3d(PSI_IMM,FourSpinType,4,3,2); /*Offsets within 4 spinor*/

  // Reconstruct 2 spinor registers
#define NEO 2
  reg_array_3d(Atmp,Fregs,1,2,2); /*CHIplus regs */
  reg_array_3d(Btmp,Fregs,1,2,2); /*CHIminus regs */
  /* A/B are register-number tables; the [1] planes start invalid (-1) and
   * are re-pointed at PSI registers below to double-buffer the loads. */
  int A[NEO][2][2] = {
    Atmp[0][0][0], Atmp[0][0][1], Atmp[0][1][0], Atmp[0][1][1],
    -1,-1,-1,-1
  };
  int B[NEO][2][2] = {
    Btmp[0][0][0], Btmp[0][0][1], Btmp[0][1][0], Btmp[0][1][1],
    -1,-1,-1,-1
  };

  /*Regs for SU3 two spinor multiply ... overlap with the reconstruct*/
  /* registers */
  int CHIR[3][2][2] = {
    A[0][0][0],A[0][0][1],
    A[0][1][0],A[0][1][1],
    B[0][0][0],B[0][0][1],
    B[0][1][0],B[0][1][1],
    PSI[0][0][0],PSI[0][0][1],
    PSI[0][1][0],PSI[0][1][1]
  };
  offset_3d(CHI_IMM,TwoSpinType,3,2,2);

  /*Registers for the gauge link (2 rows)*/
  int UA[3][2] = { {PSI[0][2][0],PSI[0][2][1]},
                   {PSI[2][1][0],PSI[2][1][1]},
                   {PSI[1][0][0],PSI[1][0][1]} };
  int UB[3][2] = { {PSI[1][1][0],PSI[1][1][1]},
                   {PSI[2][0][0],PSI[2][0][1]},
                   {PSI[1][2][0],PSI[1][2][1]}, };
  offset_3d(GIMM , GaugeType, 3, 3 ,2 );

  // Other 8 registers used for reduction variables in SU3.
  // Could use these in reconstruct??
  int E[2] = { PSI[2][2][0],PSI[2][2][1]};
  /*
   * FCD used for drain of Chi
   * Overlap with PSI[*][3][*]
   */
  int F[2] = {PSI[0][3][0],PSI[0][3][1]};
  int C[2] = {PSI[1][3][0],PSI[1][3][1]};
  int D[2] = {PSI[2][3][0],PSI[2][3][1]};

  /*
   * Integer registers
   */
  alreg(psi,Iregs);
  alreg(Umu,Iregs);
  alreg(Ufetch,Iregs);
  alreg(Chiin,Iregs);
  alreg(Chiout,Iregs);
  alreg(Chifetch,Iregs);
  reg_array_1d(Chiplus,Iregs,4);/*Pointers to the 8 2-spinors for recombination*/
  reg_array_1d(Chiminus,Iregs,4);
  alreg(mu,Iregs);
  alreg(Chidrain,Iregs);
  alreg(pref,Iregs);
  alreg(mem,Iregs);
  alreg(length,Iregs);

  int Isize = PROC->I_size;
  int Fsize = PROC->FP_size;

  /* Immediate-offset handles, in units of the named datum type. */
  def_off( ZERO_IMM, Byte,0);
  def_off( PSI_ATOM, FourSpinType, 24);
  def_off( CHI_ATOM, TwoSpinType, 12);
  def_off( PAD_CHI_ATOM, TwoSpinType, 16); /* padded 2-spinor stride */
  def_off( MAT_IMM, GaugeType, 18);
  int Ndim   = def_offset(4,Byte,"Ndim");
  int Ndimm1 = def_offset(3,Byte,"Ndimm1");
  int hbias,bias;

  /*Offsets handles to stack*/
  int hbitbucket = def_offset(16*Isize,Byte,"hbitbucket");
  int Tsize;
  if ( TwoSpinType == Double ) Tsize = PROC->FP_size;
  else Tsize = PROC->FSP_size;
  /* Four 12-word 2-spinor staging buffers on the stack, one per mu. */
  int hstk0 = def_offset(16*Isize+12*Tsize ,Byte,"hstk0");
  int hstk1 = def_offset(16*Isize+2*12*Tsize,Byte,"hstk1");
  int hstk2 = def_offset(16*Isize+3*12*Tsize,Byte,"hstk2");
  int hstk3 = def_offset(16*Isize+4*12*Tsize,Byte,"hstk3");

  int hIsize = def_offset(Isize,Byte,"Isize");

  int i,co,j,k,nxt,ri,sp,nxtco,eop,eo_a,eo_b;

  /***********************************************************************/

  /*
   * PROLOGUE
   */
  make_inst(DIRECTIVE,Enter_Routine,name);

  /*Allocate stack save any callee save registers we need etc...*/
  int stack_buf_size;
  stack_buf_size = 16*Isize + 12*Fsize * 5 ;
  hbias = grab_stack(stack_buf_size);
  bias  = get_offset(hbias);
  save_regs();
  queue_iadd_imm(mem,PROC->StackPointer,hbias); /*Pointer to buf on stack*/

  /*Define our arguments - all pointers ala fortran*/
  getarg(psi);
  getarg(Umu);
  getarg(Chiin);
  getarg(length);

  /*{... Process arguments ...*/
  queue_iload(length,ZERO_IMM,length); /*Load in sx counter*/

  retno = get_target_label(); /*Branch to exit if yzt <1*/
  check_iterations(length,retno);

  need_cache_line(0);
  need_cache_line(1);
  need_cache_line(2);
  need_cache_line(3);
  need_cache_line(4);
  pragma(DCBT_SPACE,5);
  pragma(DCBT_POST,1);

#define LOAD_U(comin,comax)\
  /*Load two link rows*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){ \
      queue_fload(UA[i][ri],GIMM[i][0][ri],Umu,GaugeType);\
      queue_fload(UB[i][ri],GIMM[i][1][ri],Umu,GaugeType);\
    } \
  }

#define PRELOAD_U LOAD_U(0,1)
#define POSTLOAD_U LOAD_U(2,2)

  PRELOAD_U

#define LOAD_CHI(comin,comax) \
  /*Load Chi column*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){\
      queue_fload(CHIR[i][0][ri],CHI_IMM[i][0][ri],Chiin,TwoSpinType);\
    } \
    for( ri=0;ri<2;ri++){\
      queue_fload(CHIR[i][1][ri],CHI_IMM[i][1][ri],Chiin,TwoSpinType);\
    } \
  }

#define PRELOAD_CHI LOAD_CHI(0,1)
#define POSTLOAD_CHI LOAD_CHI(2,2)

#define POSTLOAD \
  POSTLOAD_CHI \
  POSTLOAD_U

  do_prefetch(Chiin,0);
  do_prefetch(Chiin,1);
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chiin,2);

  PRELOAD_CHI

  /*
   * Start site loop
   */
  queue_iadd_imm(Chidrain,mem,hbitbucket); /* first drain goes to scratch */
  branchsite = start_loop(length);
  queue_iadd_imm(Chiout,mem,hstk0);

  /*
   * Loop over mu in asm
   */
  queue_iload_imm(mu,Ndimm1);

  /* Drain: store the last three complex results of the previous SU3
   * multiply through Chidrain (scratch on the first pass). */
#define CHIDRAIN \
  queue_fstore(F[0],CHI_IMM[1][1][0],Chidrain,TwoSpinType);\
  queue_fstore(F[1],CHI_IMM[1][1][1],Chidrain,TwoSpinType);\
  queue_fstore(C[0],CHI_IMM[2][0][0],Chidrain,TwoSpinType);\
  queue_fstore(C[1],CHI_IMM[2][0][1],Chidrain,TwoSpinType);\
  queue_fstore(D[0],CHI_IMM[2][1][0],Chidrain,TwoSpinType);\
  queue_fstore(D[1],CHI_IMM[2][1][1],Chidrain,TwoSpinType);

#define PREFETCH_CHI \
  queue_iadd_imm(Chifetch,Chiin,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

#define PREFETCH_CHIF \
  queue_iadd_imm(Chifetch,Chifetch,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

  /* unroll==0 emits the body of the mu-loop (3 iterations in asm);
   * unroll==1 emits a peeled final copy with different prefetch pragmas. */
  for ( int unroll=0; unroll<2; unroll++ ) {

    if ( unroll==0 ) {
      branchmu = start_loop(mu);
      pragma(DCBT_SPACE,5);
      pragma(STORE_LIM,1);
      pragma(LOAD_LIM,2);
    } else {
      pragma(STORE_LIM,2);
      pragma(DCBT_SPACE,5);
      pragma(DCBT_POST,1);
      pragma(DCBT_PRE,0);
      pragma(LOAD_LIM,2);
    }

    CHIDRAIN
    POSTLOAD

    if ( unroll == 0 ) {
      PREFETCH_CHI
      queue_iadd_imm(Ufetch,Umu,MAT_IMM);
      do_prefetch(Ufetch,0);
      do_prefetch(Ufetch,1);
      do_prefetch(Ufetch,2);
      if ( GaugeType == Double ) {
        do_prefetch(Ufetch,3);
        do_prefetch(Ufetch,4);
      }
    } else {
      pragma(DCBT_SPACE,3);
      PREFETCH_CHI
      PREFETCH_CHIF
      PREFETCH_CHIF
      PREFETCH_CHIF
    }

    /* First two rows of U times both spin components of CHI. */
    j=0;
    queue_three_cmuls(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                      D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                      E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
    j=1;
    queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
    j=2;
    queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);

    /*Store the first three results*/
    queue_fstore(C[0],CHI_IMM[0][0][0],Chiout,TwoSpinType);
    queue_fstore(C[1],CHI_IMM[0][0][1],Chiout,TwoSpinType);
    queue_fstore(D[0],CHI_IMM[0][1][0],Chiout,TwoSpinType);
    queue_fstore(D[1],CHI_IMM[0][1][1],Chiout,TwoSpinType);
    queue_fstore(E[0],CHI_IMM[1][0][0],Chiout,TwoSpinType);
    queue_fstore(E[1],CHI_IMM[1][0][1],Chiout,TwoSpinType);

    /*Load the third row*/
    for(j=0; j<3; j++) {
      for(ri=0; ri<2; ri++) {
        queue_fload(UA[j][ri],GIMM[j][2][ri],Umu,GaugeType);
      }
    }
    /*Gauge layout is linear, mu faster than site*/
    queue_iadd_imm(Umu,Umu,MAT_IMM);

    /*Now the second set of three cdots*/
    j=0;
    queue_three_cmuls(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                      C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                      D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    j=1;
    queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    j=2;
    queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                       C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                       D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
    /**************END SU3 CODE *************/

    /* F,C,D are still live here; they are drained at the top of the next
     * pass (CHIDRAIN) via Chidrain, which now points at this Chiout. */
    queue_iadd_imm(Chiin,Chiin,PAD_CHI_ATOM);
    queue_iadd_imm(Chidrain,Chiout,ZERO_IMM);
    queue_iadd_imm(Chiout,Chiout,CHI_ATOM);

    if ( unroll == 0 ) {
      PRELOAD_U
      PRELOAD_CHI
    }

    /*********************************************************/
    /****************** END OF SU3 MULTIPLY ******************/
    /*********************************************************/

    if ( unroll== 0 ) {
      stop_loop(branchmu,mu); /* End loop over mu*/
      make_inst(DIRECTIVE,Target,get_target_label() ); /*delineate the sections*/
    }
  }

  /*********************************************************/
  /****************** START OF RECONSTRUCT *****************/
  /*********************************************************/

  //Address calculation...
  // Chiminus -> Stack and ChiPlus -> Chiin
  pragma(STORE_INORDER,1);

  queue_iadd_imm(Chiminus[0],mem,hstk0);

  /*For register use reasons loop over colour outermost*/
#define LOAD_CHI_MU0(eo,co) \
  for( sp = 0; sp<2;sp++ ){\
    for( ri = 0; ri<2;ri++ ){\
      queue_fload(A[eo][sp][ri],CHI_IMM[co][sp][ri],Chiminus[0],TwoSpinType);\
      if ( co == 0 ) {\
        queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiin,TwoSpinType);\
        queue_iadd_imm(Chiplus[0],Chiin,ZERO_IMM);\
      } else {\
        queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiplus [0],TwoSpinType);\
      }\
    }}

  pragma(LOAD_LIM,2);
  LOAD_CHI_MU0(0,0)
  pragma(DCBT_POST,1);

  CHIDRAIN

  /* Even/odd double buffering of the A (minus) and B (plus) loads;
   * neo_* is the modulus for the buffer-index rotation below. */
  int neo_a = NEO;
  int neo_b = NEO;
  eo_a = 0;
  eo_b = 0;
  for ( co = 0; co <3 ; co ++ ) {
    pragma(LOAD_LIM,1);
    if ( co == 0 ) {
      // Use the third colour for unrolling the loads
      A[1][0][0] = PSI[2][0][0];
      A[1][0][1] = PSI[2][0][1];
      A[1][1][0] = PSI[2][1][0];
      A[1][1][1] = PSI[2][1][1];
      B[1][0][0] = PSI[2][2][0];
      B[1][0][1] = PSI[2][2][1];
      B[1][1][0] = PSI[2][3][0];
      B[1][1][1] = PSI[2][3][1];
      queue_iadd_imm(Chiminus[1],mem,hstk1);
      // This is invariant of loop
      // Take out
      queue_iadd_imm(Chiplus[1],Chiin ,PAD_CHI_ATOM);
    }

    /***************************************************************
     * MU = 0 reconstruct
     ****************************************************************/
    if ( co == 2 ) {
      // Flip to not unrolled due to register pressure
      neo_b = 1;
      neo_a = 2;
      A[1][0][0] = PSI[0][0][0];
      A[1][0][1] = PSI[0][0][1];
      A[1][1][0] = PSI[1][0][0];
      A[1][1][1] = PSI[1][0][1];
      pragma(DCBT_POST,0);
      pragma(DCBT_SPACE,1);
      queue_iadd_imm(Ufetch,Umu,ZERO_IMM);
      // do_prefetch(Ufetch,0);
      do_prefetch(Ufetch,1);
      do_prefetch(Ufetch,2);
      if ( GaugeType == Double ) {
        do_prefetch(Ufetch,3);
        do_prefetch(Ufetch,4);
      }
    }

    /* psi_0 = Chiplus[0] + Chiminus[0] */
    /* psi_1 = Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],B[eo_b][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],B[eo_b][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],B[eo_b][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],B[eo_b][1][1],A[eo_a][1][1]);

    // Dagger = 0:
    /* psi_2 =-iChiplus[1] +iChiminus[1] */
    /* psi_3 =-iChiplus[0] +iChiminus[0] */
    // Dagger = 1:
    /* psi_2 = iChiplus[1] -iChiminus[1] */
    /* psi_3 = iChiplus[0] -iChiminus[0] */
    if ( dagger == 0 ) {
      queue_fsub(PSI[co][2][0],B[eo_b][1][1],A[eo_a][1][1]);
      queue_fsub(PSI[co][2][1],A[eo_a][1][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][0],B[eo_b][0][1],A[eo_a][0][1]);
      queue_fsub(PSI[co][3][1],A[eo_a][0][0],B[eo_b][0][0]);
    } else {
      queue_fsub(PSI[co][2][0],A[eo_a][1][1],B[eo_b][1][1]);
      queue_fsub(PSI[co][2][1],B[eo_b][1][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][3][0],A[eo_a][0][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][1],B[eo_b][0][0],A[eo_a][0][0]);
    }

    /***************************************************************
     * MU = 1 reconstruct
     ****************************************************************/
    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[1],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [1],TwoSpinType);
      }
    }
    if ( co == 0 ) {
      queue_iadd_imm(Chiminus[2],mem,hstk2);
      queue_iadd_imm(Chiminus[3],mem,hstk3);
      queue_iadd_imm(Chiplus[2],Chiplus[1],PAD_CHI_ATOM);
      queue_iadd_imm(Chiplus[3],Chiplus[2],PAD_CHI_ATOM);
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);
    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    //Dagger == 0
    /* psi_2 += Chiplus[1] - Chiminus[1] */
    /* psi_3 += -Chiplus[0] + Chiminus[0] */
    //Dagger == 1
    /* psi_2 -= Chiplus[1] - Chiminus[1] */
    /* psi_3 -= -Chiplus[0] + Chiminus[0] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
    }

    /***************************************************************
     * MU = 2 reconstruct
     ****************************************************************/
    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[2],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [2],TwoSpinType);
      }
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);
    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

    //Dagger == 0
    /* psi_2 +=-iChiplus[0] +iChiminus[0] */
    /* psi_3 += iChiplus[1] -iChiminus[1] */
    //Dagger == 1
    /* psi_2 -=-iChiplus[0] +iChiminus[0] */
    /* psi_3 -= iChiplus[1] -iChiminus[1] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
    }

    /***************************************************************
     * MU = 3 reconstruct
     ****************************************************************/
    pragma(LOAD_LIM,2);
    eo_a = (eo_a+1)%neo_a;
    eo_b = (eo_b+1)%neo_b;
    for( sp = 0; sp<2; sp++ ) {
      for( ri = 0; ri<2; ri++ ) {
        queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[3],TwoSpinType);
        queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [3],TwoSpinType );
      }
    }

    /* psi_0 += Chiplus[0] + Chiminus[0] */
    /* psi_1 += Chiplus[1] + Chiminus[1] */
    queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

    //Dagger == 0
    /* psi_2 += Chiplus[0] - Chiminus[0] */
    /* psi_3 += Chiplus[1] - Chiminus[1] */
    //Dagger == 1
    /* psi_2 -= Chiplus[0] - Chiminus[0] */
    /* psi_3 -= Chiplus[1] - Chiminus[1] */
    if ( dagger == 0 ) {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
    } else {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
    }
    queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
    queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
    queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
    queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);
    if ( dagger == 0 ) {
      queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
      queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
      queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
      queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
    } else {
      queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
      queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
      queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
      queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
    }

    /*
     * Store the spinors. If this is problematic
     * in terms of PEC WriteBuf misses, I could
     * store to the stack and copy out later.
     */
    if ( co != 2 ) {
      LOAD_CHI_MU0(0,co+1)
      eo_a=0;
      eo_b=0;
    }
    queue_fstore(PSI[co][0][0],PSI_IMM[0][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][0][1],PSI_IMM[0][co][1],psi,FourSpinType);
  }

  /*
   * Store out in linear order now
   */
  pragma(STORE_LIM,2);
  pragma(DCBT_SPACE,8);
  for ( co=0; co<3; co ++ ) {
    queue_fstore(PSI[co][1][0],PSI_IMM[1][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][1][1],PSI_IMM[1][co][1],psi,FourSpinType);
  }
  for ( co=0; co<3; co ++ ) {
    queue_fstore(PSI[co][2][0],PSI_IMM[2][co][0],psi,FourSpinType);
    queue_fstore(PSI[co][2][1],PSI_IMM[2][co][1],psi,FourSpinType);
  }
  /* NOTE(review): when TwoSpinType == FourSpinType the spin-3 components
   * are not stored here; they appear to be drained via CHIDRAIN through
   * Chidrain = psi+CHI_ATOM instead — confirm against the data layout. */
  if ( TwoSpinType == FourSpinType ) {
    queue_iadd_imm(Chidrain,psi,CHI_ATOM);
  } else {
    queue_iadd_imm(Chidrain,mem,hbitbucket);
    for ( co=0; co<3; co ++ ) {
      queue_fstore(PSI[co][3][0],PSI_IMM[3][co][0],psi,FourSpinType);
      queue_fstore(PSI[co][3][1],PSI_IMM[3][co][1],psi,FourSpinType);
    }
  }
  queue_iadd_imm(psi,psi,PSI_ATOM);

  /*
   * Put in an artificial dependency here
   * to try to stop the preloads getting above the last load of
   * reconstruct.
   */
  queue_iadd_imm(Chiplus[3],Chiplus[3],ZERO_IMM);
  queue_iadd_imm(Chiin ,Chiplus[3],PAD_CHI_ATOM);

  pragma(DCBT_SPACE,0);
  do_prefetch(Chiin,0);
  do_prefetch(Chiin,1);
  if ( SizeofDatum(TwoSpinType) == 8 )do_prefetch(Chiin,2);
  PRELOAD_U
  PRELOAD_CHI

  /* TERMINATION point of the loop*/
  stop_loop(branchsite,length);
  CHIDRAIN
  make_inst(DIRECTIVE,Target,retno);

  /*
   *
   * EPILOGUE
   *
   */
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);
  return;
}
/*
 * Generate an assembler routine `name` computing the fermion-force
 * (derivative) contribution for domain-wall fermions: for every site,
 * every s-slice of the fifth dimension and each forward direction mu,
 * it spin-projects the neighbouring Y four-spinor (or loads an already
 * projected half spinor from the receive buffer for off-node neighbours),
 * repromotes it, and accumulates the colour outer product
 * F[cy][cx] += sum_sp conj(X[sp][cx]) * Y[sp][cy] into the force field.
 *
 * Emitted-routine argument block (array of pointers/ints at `args`):
 *   0 X_p  1 Y_p  2 F_p  3 length  4 Ls  5 tab (offset table)
 *   6 Complex_i ((0,1) x Nsimd constants)  7 unused  8 recbuf_base
 */
void dwf_deriv( char *name)
{
  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(1);
  int retno;

  /*
   * S=phi^dag (MdagM)^-1 phi
   *
   * dS = phi^dag (MdagM)^-1 [ dMdag M + Mdag dM ] (MdagM)^-1 phi
   *
   * Let X = (MdagM)^-1 phi
   *     Y = M X = M^-dag phi
   *
   * Want terms: Ydag dM X
   *             Xdag dMdag Y
   *
   * Take Xdag 1-gamma Y
   *
   * Still a bit confused about the 1+g 1-g terms; but this may be simply a factor of two as we add +h.c.
   * Will continue to follow Chroma's routine
   */
  reg_array_2d(Y,Cregs,4,3); // 4 spinor - 24 regs
  reg_array_2d(X,Cregs,4,3); // 4 spinor - 12 regs
  reg_array_1d(F,Cregs,3);   // Force
  alreg(Z,Cregs);            // Zero
  alreg(creg,Cregs);         // complex-constant register

  offset_3d(CHIIMM,FourSpinType,2,3,2*nsimd());
  offset_3d(PSIIMM,FourSpinType,4,3,2*nsimd());
  offset_3d(GIMM ,GaugeType, 3, 3 ,2*nsimd() );

  def_off( GAUGE_SITE_IMM, FourSpinType,4*18*nsimd());
  def_off( MAT_IMM , GaugeType,18*nsimd());
  def_off( PSI_IMM , FourSpinType,24*nsimd()); /* four-spinor stride */
  def_off( CHI_IMM , FourSpinType,12*nsimd()); /* half-spinor stride  */
  def_off( CONST_ZERO_OFFSET,Double,2*2*nsimd());

  /*
   * Integer registers
   */
  alreg(F_p,Iregs);   /*Pointer to the current cpt of force field */
  alreg(F_p_s,Iregs);
  alreg(Y_mu,Iregs);
  alreg(Y_p,Iregs);
  alreg(X_p,Iregs);
  alreg(length,Iregs); /*number of sites*/
  alreg(tab,Iregs);    /*Pointer to current entry in offset table*/
  alreg(Complex_i,Iregs);/*Point to (0,1)x Nsimd*/
  alreg(Ls,Iregs);
  alreg(s,Iregs);
  alreg(recbuf_base,Iregs);
  alreg(args,Iregs);
  alreg(s_offset,Iregs);

  /*Useful integer immediate constants, in units of Fsize*/
  def_off( ZERO_IMM,Byte,0);
  def_off( minusone,Byte,-1);
  def_off( one,Byte,1);
  // Mask bits for predicating directions
  def_off( mask_0,Byte,1);
  def_off( mask_1,Byte,2);
  def_off( mask_2,Byte,4);
  def_off( mask_3,Byte,8);
  def_off( mask_4,Byte,16);
  def_off( mask_5,Byte,32);
  def_off( mask_6,Byte,64);
  def_off( mask_7,Byte,128);
  int mask_imm[8] = { mask_0, mask_1, mask_2, mask_3, mask_4, mask_5, mask_6, mask_7 };
  alreg(mask,Iregs);

  offset_1d(TAB_IMM,TableType,17);

  // Integer sizes
  int Isize = def_offset(PROC->I_size,Byte,"Isize");
  int ISsize = def_offset(PROC->IS_size,Byte,"ISsize");

  int i,j,co,sp;

  /*********************************************************************/

  make_inst(DIRECTIVE,Enter_Routine,name);
  grab_stack(0);
  save_regs();

  /*********************************************
   * our arguments
   *********************************************
   */
  getarg(args); /*Pointer to arg list*/
  queue_iload(X_p, ZERO_IMM,args); queue_load_addr(args,Isize,args); //0
  queue_iload(Y_p, ZERO_IMM,args); queue_load_addr(args,Isize,args); //1
  queue_iload(F_p, ZERO_IMM,args); queue_load_addr(args,Isize,args); //2
  queue_iload(length,ZERO_IMM,args); queue_load_addr(args,Isize,args); //3
  queue_iload(Ls, ZERO_IMM,args); queue_load_addr(args,Isize,args); //4
  queue_iload(tab, ZERO_IMM,args); queue_load_addr(args,Isize,args); //5
  queue_iload(Complex_i,ZERO_IMM,args);queue_load_addr(args,Isize,args); //6
  queue_load_addr(args,Isize,args); //7 (slot skipped)
  queue_iload(recbuf_base,ZERO_IMM,args);queue_load_addr(args,Isize,args); //8

  /**************************************************
   * Load common constants into Iregs
   **************************************************
   */
  for (int i =0; i<12; i++ ) {
    need_constant(i*2*SizeofDatum(FourSpinType)*nsimd());
  }
  for (int i =0; i<9; i++ ) {
    need_constant(i*2*SizeofDatum(GaugeType)*nsimd());
  }
  complex_constants_prepare(creg,Complex_i);
  complex_load(Z,CONST_ZERO_OFFSET,Complex_i,Double); /* Z := zero vector */

  // Site loop
  retno = get_target_label();
  check_iterations(length,retno);
  int branchsite = start_loop(length);

  // S loop (fifth dimension); mask predicates interior/exterior per dir
  queue_iload_short(mask,TAB_IMM[10],tab);
  queue_iadd_imm (s,Ls,ZERO_IMM);
  queue_iload_imm(s_offset,ZERO_IMM);
  int branchls = start_loop(s);
  queue_iadd_imm(F_p_s,F_p,ZERO_IMM);
  // debugI(s);

  // Loop over directions
  for ( int mu=0;mu<4;mu++ ) {

    int dir = mu*2+1; // Always in forward dir

    // Complex branch structure for interior/exterior neighbours
    int lab_proj_mu = get_target_label();
    int lab_continue = get_target_label();

    queue_iand_imm (Y_mu,mask,mask_imm[dir]); // non-zero if exterior
    check_iterations(Y_mu,lab_proj_mu);

    // Exterior points are already projected. Just load.
    queue_iload_short(Y_mu,TAB_IMM[dir],tab);
    queue_iadd (Y_mu,Y_mu,recbuf_base);
    // debugI(Y_mu);
    //debugI(recbuf_base);
    queue_iadd (Y_mu,Y_mu,s_offset); /* recv buffer is half-spinor packed */
    for(int sp=0;sp<2;sp++){
      for(int co=0;co<3;co++){
        complex_load(Y[sp][co],PSIIMM[sp][co][0],Y_mu,FourSpinType);
      }
    }
    jump(lab_continue);
    make_inst(DIRECTIVE,Target,lab_proj_mu);

    // Interior points are not already projected.
    // * Spin project 4 spinor
    queue_iload_short(Y_mu,TAB_IMM[dir],tab);
    // debugI(tab);
    // debugI(Y_mu);
    queue_iadd (Y_mu,Y_mu,Y_p);
    /* s_offset advances in half-spinor (CHI_IMM) units; Y_p holds full
     * four-spinors, so it is added TWICE (PSI_IMM == 2*CHI_IMM).  The
     * apparent duplication below is deliberate. */
    queue_iadd (Y_mu,Y_mu,s_offset); // offset for this "s"
    queue_iadd (Y_mu,Y_mu,s_offset); // offset for this "s"
    for(int sp=0;sp<4;sp++){
      for(int co=0;co<3;co++){
        complex_load(X[sp][co],PSIIMM[sp][co][0],Y_mu,FourSpinType);
        // debugC(X[sp][co]);
      }
    }

    int pm = 1; // pm=0 == 1+gamma, pm=1 => 1-gamma
    if ( dagger ) pm = 0;

    /* Upper two spin components of the projector (1 -/+ gamma_mu). */
    if ( mu == 0 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_ApiB(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_ApiB(Y[1][co],X[1][co],X[2][co]);
      } else {
        for(co=0;co<3;co++) complex_AmiB(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_AmiB(Y[1][co],X[1][co],X[2][co]);
      }
    } else if ( mu == 1 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_sub(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_add(Y[1][co],X[1][co],X[2][co]);
      } else {
        for(co=0;co<3;co++) complex_add(Y[0][co],X[0][co],X[3][co]);
        for(co=0;co<3;co++) complex_sub(Y[1][co],X[1][co],X[2][co]);
      }
    } else if ( mu == 2 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_ApiB(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_AmiB(Y[1][co],X[1][co],X[3][co]);
      } else {
        for(co=0;co<3;co++) complex_AmiB(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_ApiB(Y[1][co],X[1][co],X[3][co]);
      }
    } else if ( mu == 3 ) {
      if ( pm ==0 ) {
        for(co=0;co<3;co++) complex_add(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_add(Y[1][co],X[1][co],X[3][co]);
      } else {
        for(co=0;co<3;co++) complex_sub(Y[0][co],X[0][co],X[2][co]);
        for(co=0;co<3;co++) complex_sub(Y[1][co],X[1][co],X[3][co]);
      }
    }
    make_inst(DIRECTIVE,Target,lab_continue);

    ///////////////////////////////////////////////////////////////
    // Y contains spin projection of forward neighbour in mu direction
    // Repromote to Y to 4 spinor
    ///////////////////////////////////////////////////////////////
    for(int co_y=0;co_y<3;co_y++){
      if ( (mu==0) && (pm==0) ) complex_AmiB(Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==0) && (pm==1) ) complex_ApiB(Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==1) && (pm==0) ) complex_add (Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==1) && (pm==1) ) complex_sub (Y[2][co_y],Z,Y[1][co_y]);
      if ( (mu==2) && (pm==0) ) complex_AmiB(Y[2][co_y],Z,Y[0][co_y]);
      if ( (mu==2) && (pm==1) ) complex_ApiB(Y[2][co_y],Z,Y[0][co_y]);
      if ( (mu==3) && (pm==0) ) complex_add (Y[2][co_y],Z,Y[0][co_y]);
      if ( (mu==3) && (pm==1) ) complex_sub (Y[2][co_y],Z,Y[0][co_y]);

      if ( (mu==0) && (pm==0) ) complex_AmiB(Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==0) && (pm==1) ) complex_ApiB(Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==1) && (pm==0) ) complex_sub (Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==1) && (pm==1) ) complex_add (Y[3][co_y],Z,Y[0][co_y]);
      if ( (mu==2) && (pm==0) ) complex_ApiB(Y[3][co_y],Z,Y[1][co_y]);
      if ( (mu==2) && (pm==1) ) complex_AmiB(Y[3][co_y],Z,Y[1][co_y]);
      if ( (mu==3) && (pm==0) ) complex_add (Y[3][co_y],Z,Y[1][co_y]);
      if ( (mu==3) && (pm==1) ) complex_sub (Y[3][co_y],Z,Y[1][co_y]);
    }

    ///////////////////////////////////////////////////////////////
    // Load X
    ///////////////////////////////////////////////////////////////
    for(int co_x=0;co_x<3;co_x++){
      for(int sp=0;sp<4;sp++) {
        complex_load(X[sp][co_x],PSIIMM[sp][co_x][0],X_p,FourSpinType);
      }
    }

    ///////////////////////////////////////////////////////////////
    // Spin trace tensor product
    ///////////////////////////////////////////////////////////////
    for(int co_x=0;co_x<3;co_x++){
      // Spin trace outer product: read-modify-write one colour column
      // of the force matrix at F_p_s.
      for ( int co_y=0;co_y<3;co_y++) complex_load (F[co_y],GIMM[co_y][co_x][0],F_p_s);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[0][co_x],Y[0][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[1][co_x],Y[1][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[2][co_x],Y[2][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_conjmadd(F[co_y],X[3][co_x],Y[3][co_y]);
      for ( int co_y=0;co_y<3;co_y++) complex_store(F[co_y],GIMM[co_y][co_x][0],F_p_s);
    }
    queue_load_addr(F_p_s,MAT_IMM,F_p_s); /* next mu's force matrix */
  }
  queue_iadd_imm(X_p,X_p,PSI_IMM);
  queue_iadd_imm(s_offset,s_offset,CHI_IMM);
  stop_loop(branchls,s);

  queue_iadd_imm(F_p,F_p_s,ZERO_IMM);
  queue_load_addr(tab,TAB_IMM[16],tab); /* advance to next site's table row */
  stop_loop(branchsite,length);

  make_inst(DIRECTIVE,Target,retno);

  /*
   *
   * EPILOGUE
   *
   */
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);
  return;
}
/*
 * Generate an assembler routine `name` that interleaves (SIMD-merges) two
 * streams of two-spinors: for each of `counter` iterations it loads one
 * 6-complex atom from vec1 and one from vec2, merges them lane-wise with
 * complex_simd_merge, and stores the doubled-width result to out.
 * Supports an optional half-precision path (half_precision) that converts
 * through a 64-bit mask register and stack scratch space.
 *
 * Emitted-routine arguments: outptr, vec1ptr, vec2ptr, counter.
 */
void qcdoc_merge( char *name)
{
  int dum = defargcount(5);

  /*Integer register usage*/
  alreg(outptr,Iregs);
  alreg(vec1ptr,Iregs);
  alreg(vec2ptr,Iregs);
  alreg(counter,Iregs);
  /*Floating register usage*/
  reg_array_1d(vec1,Cregs,3);
  reg_array_1d(vec2,Cregs,3);
  reg_array_1d(oreg,Cregs,6);
  alreg(permreg,Cregs);

  def_off(ZERO,SpinorType,0);
  def_off (IN_ATOM,SpinorType,6*nsimd());   // 2spins worth, 3 colors x complex
  def_off (OUT_ATOM,SpinorType,12*nsimd()); // 2spins worth, 3 colors x complex x simd
  def_off(bits16,Byte,0xFFFF);
  def_off(thirtytwo,Byte,32);
  def_off(sixteen,Byte,16);
  offset_2d(CHI_IMM,SpinorType,6,2*nsimd());

  int Isize = PROC->I_size;
  int word_size = def_offset(Isize,Byte,"word_size");

  struct stream *PreOut;
  struct stream *PreVec1;
  struct stream *PreVec2;

  int brchno,retno; /*Branch target handles*/
  int co;

  make_inst(DIRECTIVE,Enter_Routine,name);
  int bias = grab_stack(64);
  save_regs();
  queue_iadd_imm(PROC->StackPointer,PROC->StackPointer,bias);
  getarg(outptr); /*Get args*/
  getarg(vec1ptr);
  getarg(vec2ptr);
  getarg(counter);

  /* Scratch registers for the half-precision conversion path. */
  alreg(Mask,Iregs);
  alreg(Convert1,Iregs);
  alreg(Convert2,Iregs);
  int memory = PROC->StackPointer;

  for (int i =0; i<6; i++ ) {
    need_constant(i*2*SizeofDatum(SpinorType)*nsimd());
  }
  need_constant(64);
  complex_simd_init(permreg);

  if ( half_precision ) {
    /* Build the 64-bit lane mask 0xFFFF_FFFF_FFFF_0000 used by the
     * half-precision load/store converters. */
    queue_iload_imm(Mask,ZERO);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,thirtytwo);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,sixteen);
  }

  /*
   * Insert a label to prevent reordering
   */
  make_inst(DIRECTIVE,Target,get_target_label());

  PreVec1= create_stream(IN_ATOM,vec1ptr ,counter,STREAM_IN ,LINEAR);
  PreVec2= create_stream(IN_ATOM,vec2ptr ,counter,STREAM_IN ,LINEAR);
  PreOut = create_stream(OUT_ATOM,outptr ,counter,STREAM_OUT ,LINEAR);

  /*Branch to stack restore if length <1*/
  retno = get_target_label();
  check_iterations(counter,retno);

  /*
   * Start software pipeline
   */
  brchno = start_loop(counter);

  int indco[3]={0,1,2};
  int permute_mu=3;
  for(int ico=0;ico<3;ico++){
    co = indco[ico];
    // Could do entirely in integer unit for half precision to accelerate this
    if ( half_precision ) {
      complex_load_half(vec1[co],CHI_IMM[co][0],vec1ptr,memory,Convert1,Convert2,Mask);
      complex_load_half(vec2[co],CHI_IMM[co][0],vec2ptr,memory,Convert1,Convert2,Mask);
    } else {
      complex_load(vec1[co],CHI_IMM[co][0],vec1ptr,SpinorType);
      complex_load(vec2[co],CHI_IMM[co][0],vec2ptr,SpinorType);
    }
  }

  {
    // Merge the vectors: even/odd SIMD halves of each colour pair
    for(co=0;co<3;co++) complex_simd_merge (0,permute_mu,oreg[co*2] ,vec1[co],vec2[co]);
    for(co=0;co<3;co++) complex_simd_merge (1,permute_mu,oreg[co*2+1],vec1[co],vec2[co]);
  }
  // make_inst(DIRECTIVE,LS_BARRIER);

  for(int i=0;i<6;i++){
    // 2 SIMD sites, 3 colors, 2 spins 2 complex == 24 floats
    if ( half_precision ) {
      complex_store_half(oreg[i],CHI_IMM[i][0],outptr,memory,Convert1,Convert2,Mask);
    } else {
      complex_store(oreg[i],CHI_IMM[i][0],outptr,SpinorType);
    }
  }

  iterate_stream(PreVec1);
  iterate_stream(PreVec2);
  do_prefetch(vec1ptr,0);
  do_prefetch(vec2ptr,0);
  do_prefetch(vec1ptr,1);
  do_prefetch(vec2ptr,1);
  iterate_stream(PreOut);
  stop_loop(brchno,counter);
  make_inst(DIRECTIVE,Target,retno);

  queue_isub_imm(PROC->StackPointer,PROC->StackPointer,bias);
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);
  return;
}
/*
 * Generate an assembler routine `name` applying a clover term to a
 * spinor field: per site, Chi = Clover * Psi where the 6x6 complex
 * matrix is stored packed as 6 (real-diagonal) entries followed by the
 * 15 lower-triangle complex entries (42 reals per site).  The madd /
 * conjmadd pairing below is consistent with a Hermitian matrix:
 * Chi[i] += Clo[k]*Psi[j] below the diagonal and conj(Clo[k])*Psi[j]
 * above it.
 *
 * Emitted-routine argument block at `args`: Chi_p (output),
 * Psi_p (input), Clo_p (clover field), length (site count).
 */
void clov_apply( char *name)
{
  /*
   * This marks the argument registers as defined by ABI as off limits
   * to us until they are freed by "getarg()";
   */
  int dum = defargcount(1);
  int retno;
  int branchsite;

  /*-------------------------------------------------------------------------------
   * registers used
   *-------------------------------------------------------------------------------
   */
  reg_array_1d(Chi ,Cregs,6); // 2 spinors - 6 regs
  reg_array_1d(Psi ,Cregs,6); // 2 spinors - 6 regs
  reg_array_1d(Clo ,Cregs,15);
  offset_2d(PSIIMM,FourSpinType,6,2*nsimd());
  offset_2d(CLOIMM,CloverType,21,2*nsimd());

  /*
   * Integer registers
   */
  alreg(Clo_p,Iregs); /*Pointer to the current cpt of the clover field */
  alreg(Chi_p,Iregs); /*Pointer to the output spinor */
  alreg(Psi_p,Iregs); /*Pointer to current cpt input PSI field */
  alreg(length,Iregs); /*number of sites*/
  //alreg(tab,Iregs); /*Pointer to current entry in offset table*/
  alreg(args,Iregs);

  /*Useful integer immediate constants, in units of Fsize*/
  def_off(ZERO_IMM,Byte,0);
  def_off(SPINOR, FourSpinType, 12*nsimd());
  def_off(CLOVER, CloverType, 42*nsimd());
  int Isize = def_offset(PROC->I_size,Byte,"Isize");

  /*--------------------------------------------------------------------
   * Start of the "pseudo assembler" proper.
   *--------------------------------------------------------------------
   */
  make_inst(DIRECTIVE,Enter_Routine,name);
  grab_stack(0);
  save_regs();

  /*
   * Define our arguments
   */
  getarg(args); /*Pointer to arg list*/
  queue_iload(Chi_p, ZERO_IMM,args); queue_load_addr(args,Isize,args);
  queue_iload(Psi_p, ZERO_IMM,args); queue_load_addr(args,Isize,args);
  queue_iload(Clo_p, ZERO_IMM,args); queue_load_addr(args,Isize,args);
  queue_iload(length,ZERO_IMM,args);

  for (int i =0; i<6; i++ ) {
    need_constant(i*2*SizeofDatum(FourSpinType)*nsimd());
  }
  for (int i =0; i<21; i++ ) {
    need_constant(i*2*SizeofDatum(CloverType)*nsimd());
  }

  retno = get_target_label(); /*Branch to exit if length <1*/
  check_iterations(length,retno);

  /*
   * Site loop
   */
  branchsite = start_loop(length);

  /* Load the 6 spinor components and the 6 diagonal clover entries,
   * form the diagonal product, then reload Clo[] with the 15
   * off-diagonal entries (indices 6..20 of the packed layout). */
  for(int pp=0; pp<6; pp++) {
    complex_load(Psi[pp],PSIIMM[pp][0],Psi_p,FourSpinType);
  }
  for(int qq=0; qq<6; qq++) {
    complex_load(Clo[qq],CLOIMM[qq][0],Clo_p,CloverType);
  }
  for(int rr=0; rr<6; rr++) {
    complex_mul (Chi[rr], Clo[rr], Psi[rr]);
  }
  for(int ss=0; ss<15; ss++) {
    complex_load(Clo[ss],CLOIMM[ss+6][0],Clo_p,CloverType);
  }

  /* Off-diagonal accumulation; Clo[k] is the packed lower-triangle
   * element and conjmadd supplies the Hermitian-conjugate partner. */
  complex_conjmadd(Chi[0], Clo[0], Psi[1]);
  complex_conjmadd(Chi[0], Clo[1], Psi[2]);
  complex_conjmadd(Chi[0], Clo[3], Psi[3]);
  complex_conjmadd(Chi[0], Clo[6], Psi[4]);
  complex_conjmadd(Chi[0], Clo[10], Psi[5]);

  complex_madd    (Chi[1], Clo[0], Psi[0]);
  complex_conjmadd(Chi[1], Clo[2], Psi[2]);
  complex_conjmadd(Chi[1], Clo[4], Psi[3]);
  complex_conjmadd(Chi[1], Clo[7], Psi[4]);
  complex_conjmadd(Chi[1], Clo[11], Psi[5]);

  complex_madd    (Chi[2], Clo[1], Psi[0]);
  complex_madd    (Chi[2], Clo[2], Psi[1]);
  complex_conjmadd(Chi[2], Clo[5], Psi[3]);
  complex_conjmadd(Chi[2], Clo[8], Psi[4]);
  complex_conjmadd(Chi[2], Clo[12], Psi[5]);

  complex_madd    (Chi[3], Clo[3], Psi[0]);
  complex_madd    (Chi[3], Clo[4], Psi[1]);
  complex_madd    (Chi[3], Clo[5], Psi[2]);
  complex_conjmadd(Chi[3], Clo[9], Psi[4]);
  complex_conjmadd(Chi[3], Clo[13], Psi[5]);

  complex_madd    (Chi[4], Clo[6], Psi[0]);
  complex_madd    (Chi[4], Clo[7], Psi[1]);
  complex_madd    (Chi[4], Clo[8], Psi[2]);
  complex_madd    (Chi[4], Clo[9], Psi[3]);
  complex_conjmadd(Chi[4], Clo[14], Psi[5]);

  complex_madd    (Chi[5], Clo[10], Psi[0]);
  complex_madd    (Chi[5], Clo[11], Psi[1]);
  complex_madd    (Chi[5], Clo[12], Psi[2]);
  complex_madd    (Chi[5], Clo[13], Psi[3]);
  complex_madd    (Chi[5], Clo[14], Psi[4]);

  for(int sp=0; sp<6; sp++) {
    complex_store(Chi[sp],PSIIMM[sp][0],Chi_p,FourSpinType);
  }

  queue_iadd_imm(Psi_p,Psi_p,SPINOR);
  queue_iadd_imm(Chi_p,Chi_p,SPINOR);
  queue_iadd_imm(Clo_p,Clo_p,CLOVER);

  /* TERMINATION point of the loop*/
  stop_loop(branchsite,length);
  make_inst(DIRECTIVE,Target,retno);

  /* EPILOGUE */
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);
  return;
}