Example #1
0
File: nec.c Project: opicron/mame
static CPU_EXECUTE( necv )
{
	nec_state_t *nec_state = get_safe_token(device);

	/*
	 * Halted: drop the remaining cycle budget to zero, notify the debugger
	 * hook with (Sreg(PS) << 4) + ip, and execute nothing.
	 */
	if (nec_state->halted)
	{
		nec_state->icount = 0;
		debugger_instruction_hook(device, (Sreg(PS)<<4) + nec_state->ip);
		return;
	}

	/* Run one instruction per iteration until the cycle budget is used up. */
	while (nec_state->icount > 0)
	{
		int cycles_before;

		/*
		 * Take a pending interrupt unless the no_interrupt countdown is
		 * still running.  A pending NMI is taken unconditionally; anything
		 * else is taken only when the IF flag is set.
		 */
		if (nec_state->pending_irq && nec_state->no_interrupt == 0 &&
		    ((nec_state->pending_irq & NMI_IRQ) || nec_state->IF))
		{
			external_int(nec_state);
		}

		/* The interrupt lock-out counts down by one per loop iteration. */
		if (nec_state->no_interrupt)
			nec_state->no_interrupt--;

		debugger_instruction_hook(device, (Sreg(PS)<<4) + nec_state->ip);

		/* Snapshot the cycle counter so do_prefetch() receives the value from before the instruction ran. */
		cycles_before = nec_state->icount;
		nec_instruction[fetchop(nec_state)](nec_state);
		do_prefetch(nec_state, cycles_before);
	}
}
Example #2
0
File: nec.cpp Project: RalfVB/mame
void nec_common_device::execute_run()
{
	/*
	 * Halted: drop the remaining cycle budget to zero, notify the debugger
	 * hook with (Sreg(PS) << 4) + m_ip, and execute nothing.
	 */
	if (m_halted)
	{
		m_icount = 0;
		debugger_instruction_hook(this, (Sreg(PS)<<4) + m_ip);
		return;
	}

	/* Run one instruction per iteration until the cycle budget is used up. */
	while (m_icount > 0)
	{
		/*
		 * Take a pending interrupt unless the m_no_interrupt countdown is
		 * still running.  A pending NMI is taken unconditionally; anything
		 * else is taken only when m_IF is set.
		 */
		if (m_pending_irq && m_no_interrupt == 0 &&
		    ((m_pending_irq & NMI_IRQ) || m_IF))
		{
			external_int();
		}

		/* The interrupt lock-out counts down by one per loop iteration. */
		if (m_no_interrupt)
			m_no_interrupt--;

		debugger_instruction_hook(this, (Sreg(PS)<<4) + m_ip);

		/* Snapshot the cycle counter so do_prefetch() receives the value from before the instruction ran. */
		int cycles_before = m_icount;
		(this->*s_nec_instruction[fetchop()])();
		do_prefetch(cycles_before);
	}
}
Example #3
0
/*
 * Run the CPU referenced by sChipsPtr for up to `cycles` cycles.
 *
 * Returns `cycles` immediately when the CPU is halted; otherwise returns
 * cycles minus whatever is left in icount when the loop stops (icount
 * exhausted or stop_run set).
 */
int nec_execute(int cycles)
{
	nec_state_t *nec_state = sChipsPtr;

	/* Load the cycle budget for this call. */
	nec_state->icount = cycles;
	nec_state->cycles_remaining = cycles;

	/*
	 * Halted: nothing is executed and the full budget is reported as used.
	 * NOTE(review): this early return leaves cycles_remaining == cycles and
	 * does not update cycles_total or clear stop_run, unlike the normal exit
	 * at the bottom of the function -- confirm that is intended.
	 */
	if (nec_state->halted)
	{
		nec_state->icount = 0;
		return cycles;
	}

	while (nec_state->icount > 0 && !nec_state->stop_run)
	{
		int cycles_before;

		/*
		 * Take a pending interrupt unless the no_interrupt countdown is
		 * still running.  A pending NMI is taken unconditionally; anything
		 * else is taken only when the IF flag is set.
		 */
		if (nec_state->pending_irq && nec_state->no_interrupt == 0 &&
		    ((nec_state->pending_irq & NMI_IRQ) || nec_state->IF))
		{
			external_int(nec_state);
		}

		/* The interrupt lock-out counts down by one per loop iteration. */
		if (nec_state->no_interrupt)
			nec_state->no_interrupt--;

		/* Snapshot the cycle counter so do_prefetch() receives the value from before the instruction ran. */
		cycles_before = nec_state->icount;
		nec_instruction[fetchop(nec_state)](nec_state);
		do_prefetch(nec_state, cycles_before);
	}

	/* Account for the cycles actually used and reset the per-call state. */
	nec_state->cycles_total += cycles - nec_state->icount;
	nec_state->cycles_remaining = 0;
	nec_state->stop_run = 0;

	return cycles - nec_state->icount;
}
Example #4
0
/*
 * prefetch_irritator - body of one prefetch irritator thread.
 *
 * arg points to the struct thread_context describing this thread.
 *
 * The function binds the calling thread to the CPU named in the context,
 * then runs current_rule->num_oper passes.  Each pass draws a random number
 * and runs either every prefetch variant enabled in current_rule->pf_conf
 * (RR_ALL_ENABLED_PREFETCH_ALGORITHMS) or the single variant selected in
 * th_array[thread_no].prefetch_algorithm over the thread's contiguous memory
 * region.  When a checking variant (do_prefetch, transient_dcbt,
 * prefetch_dcbtna) returns non-zero, a miscompare is logged, the data is
 * dumped and the function returns at once; note that this path does not
 * reach the CPU-unbind code at the bottom.  The loop also stops early as
 * soon as exit_flag becomes non-zero.
 *
 * The log calls pass arguments whose types match their conversion
 * specifiers: 64-bit quantities are printed with %llx, the region start is
 * converted to void * for %p, and the long offset uses %ld.
 */
void prefetch_irritator(void *arg)
{
    int rc, tid, thread_no, oper, number_of_operations;
    unsigned long long random_no, starting_address, memory_fetch_size;
    pthread_t ptid;
    unsigned char *start_addr;
    struct thread_context *th = (struct thread_context *)arg;
    struct ruleinfo *current_rule = th->current_rule;
    int cache_type = current_rule->tgt_cache;
    int cache_line_size = system_information.cinfo[cache_type].line_size;
    unsigned int loop_count;
    long int offset;
    unsigned long long temp_storage = 0x1, temp_pattern = 0x1;

    /*
     * char *contig_mem[NUM_SEGS*SEG_SIZE/(16*M)]; Physically contiguous
     * memory pointer. memory_set_size variable  gives total memory
     * allocated both are variables of global structure.
     */

    thread_no = th->thread_no;
    int pcpu = pcpus_thread_wise[thread_no];
    tid  = th->bind_to_cpu;     /* Logical CPU to bind to */
    ptid = th->tid;             /* PThread id of this thread */
    /*
     * prefetch_streams is not declared in this function.
     * NOTE(review): if it is shared between threads this store is
     * unsynchronised -- confirm against its definition.
     */
    prefetch_streams = th->prefetch_streams;

    if (current_rule->testcase_type != PREFETCH_ONLY) {
        /* Set Thread Cancel Type as ASYNCHRONOUS */
        pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
    }

#ifdef __HTX_LINUX__
    if (pcpu == -1) {
        /* No physical CPU recorded yet: bind and remember the result. */
        pcpu = htx_bind_thread(tid, -1);
        rc = pcpu;
        pcpus_thread_wise[thread_no] = pcpu;
        if (pcpu < 0) {
            pcpus_thread_wise[thread_no] = -1;
        }
    }
    else {
        rc = htx_bind_thread(tid, pcpu);
    }
#else
    rc = bindprocessor(BINDTHREAD, thread_self(), tid);
#endif
    DEBUG_LOG("[%d] thread %d, binding to cpu %d \n",__LINE__,thread_no,tid);

    if (rc < 0) {
#ifdef __HTX_LINUX__
        if (rc == -2) {
            /* rc == -2 is reported below as the CPU having been hot removed. */
            tot_thread_count --;
            sprintf(msg,"lcpu:%d(pcpu=%d) prefetch has been hot removed, thread will be terminating now tot_thread_count=%d\n",tid,pcpu,tot_thread_count);
            hxfmsg(&h_d, errno, HTX_HE_INFO, msg);
            pthread_exit(NULL);
        }
        else {
            sprintf(msg, "%d: Bindprocessor for prefetch irritator on lcpu:%d and corresponding pcpu:%d failed with rc=%d\n", __LINE__, tid,pcpu,rc);
            hxfmsg(&h_d, errno, HTX_HE_HARD_ERROR, msg);
        }
#else
        sprintf(msg, "Binding to cpu:%d  failed with errno: %d \n",tid,errno);
        hxfmsg(&h_d, errno, HTX_HE_SOFT_ERROR, msg);
#endif
    } /* End of if */
    else {
#ifdef DEBUG
        /* pthread_t is converted explicitly so the argument matches %lx. */
        sprintf(msg,"[%d] Bindprocessor success [prefetch thread_bo %d]! cpu_no : %d , pthread id : 0x%lx \n",__LINE__,thread_no,tid,(unsigned long)ptid);
        hxfmsg(&h_d, errno, HTX_HE_INFO, msg);
#endif
    }

    /* Seed this thread's drand48 state from the wall clock. */
    th->seedval = time(NULL);
    srand48_r(th->seedval,&th->buffer);

    number_of_operations = current_rule->num_oper;

    starting_address  = (unsigned long long)(th_array[thread_no].start_of_contiguous_memory);
    memory_fetch_size = current_rule->prefetch_memory_size - BYTES_EXC;
    loop_count        = memory_fetch_size / cache_line_size;

    /*
     * NOTE(review): the miscompare messages below locate the failing datum
     * with a hard-coded stride of 128 bytes although loop_count is derived
     * from cache_line_size; the round-robin TRANSIENT/NA reports also index
     * from starting_address whereas the single-algorithm NA report indexes
     * from start_addr.  Left unchanged -- confirm against the checkers.
     */
    for (oper = 0; oper < number_of_operations; oper++) {

        /* if SIGTERM was received, exit */
        if (exit_flag != 0) {
            break;
        }

        /* Replicate the random value into both 32-bit halves. */
        random_no = get_random_number_perf(thread_no);
        random_no = (unsigned long long)(random_no<<32) | (random_no);
        th_array[thread_no].prev_seed = random_no;

        /* Now write DSCR if needed */
        if ( system_information.pvr >= POWER8_MURANO ) {
            prefetch_randomise_dscr(random_no, th->current_rule->pf_dscr , thread_no);
        }

        if (th_array[thread_no].prefetch_algorithm == RR_ALL_ENABLED_PREFETCH_ALGORITHMS) {
            /* Run all the enabled prefetch variants in round robin method */

            /* If prefetch nstride is set in the current prefetch configuration */
            if ( (PREFETCH_NSTRIDE & current_rule->pf_conf) == PREFETCH_NSTRIDE ) {
                n_stride(starting_address,memory_fetch_size,random_no,&th_array[thread_no].prefetch_scratch_mem[0]);
            }

            /* if SIGTERM was received, exit */
            if (exit_flag != 0) {
                break;
            }

            /* If prefetch partial is set in the current prefetch configuration */
            if ( (PREFETCH_PARTIAL & current_rule->pf_conf) == PREFETCH_PARTIAL ) {
                partial_dcbt(starting_address,memory_fetch_size,random_no,&th_array[thread_no].prefetch_scratch_mem[0]);
            }

            /* if SIGTERM was received, exit */
            if (exit_flag != 0) {
                break;
            }

            if ( (PREFETCH_IRRITATOR & current_rule->pf_conf) == PREFETCH_IRRITATOR ) {
                rc = do_prefetch( starting_address , memory_fetch_size , random_no, thread_no, loop_count, th_array[thread_no].pattern);
                if ( rc != 0 ) {
                    sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%llx Actual data = 0x%llx thread_index : 0x%x Start of memory = %p, memory size = 0x%llx\n"
                                ,__LINE__,(unsigned long long)th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, (void *)(unsigned long)starting_address, memory_fetch_size);
                    hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
                    dump_miscompare_data(thread_no, (unsigned char *)starting_address);
                    return;
                }
            }

            /* if SIGTERM was received, exit */
            if (exit_flag != 0) {
                break;
            }

            if ( (PREFETCH_TRANSIENT & current_rule->pf_conf) == PREFETCH_TRANSIENT ) {
                /* Start 0..15 bytes into the region, chosen from the random value. */
                offset     = random_no % (long)16;
                start_addr = (unsigned char *)starting_address + offset;
                rc = transient_dcbt((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern );
                if ( rc != 0 ) {
                    sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%llx Actual data = 0x%llx thread_index : 0x%x Start of memory = %p, memory size = 0x%llx\n"
                                ,__LINE__,(unsigned long long)th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, (void *)(unsigned long)starting_address, memory_fetch_size);
                    hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
                    dump_miscompare_data(thread_no, (unsigned char *)starting_address);
                    return;
                }
            }

            /* if SIGTERM was received, exit */
            if (exit_flag != 0) {
                break;
            }

            if ( (PREFETCH_NA & current_rule->pf_conf) == PREFETCH_NA ) {
                /* Start 0..15 bytes into the region, chosen from the random value. */
                offset     = random_no % (long)16;
                start_addr = (unsigned char *)starting_address + offset;
                rc = prefetch_dcbtna((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern,&temp_storage,&temp_pattern);
                if ( rc != 0 ) {
                    sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%llx Actual data = 0x%llx copied data = 0x%llx, copied pattern = 0x%llx, thread_index : 0x%x Start of memory = %p, memory size = 0x%llx\n"
                                ,__LINE__,(unsigned long long)th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), temp_storage, temp_pattern, thread_no, (void *)(unsigned long)starting_address, memory_fetch_size);
                    hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
                    dump_miscompare_data(thread_no, (unsigned char *)starting_address);
                    return;
                }
            }

            /* if SIGTERM was received, exit */
            if (exit_flag != 0) {
                break;
            }

        }
        else { /* Else Run only the specified algorithm */

            if (th_array[thread_no].prefetch_algorithm == PREFETCH_NSTRIDE) {
                n_stride(starting_address, memory_fetch_size, random_no, &th_array[thread_no].prefetch_scratch_mem[0]);
            }
            else if (th_array[thread_no].prefetch_algorithm == PREFETCH_PARTIAL) {
                partial_dcbt(starting_address, memory_fetch_size, random_no, &th_array[thread_no].prefetch_scratch_mem[0]);
            }
            else if (th_array[thread_no].prefetch_algorithm == PREFETCH_TRANSIENT) {
                /* Start 0..15 bytes into the region, chosen from the random value. */
                offset     = random_no % (long)16;
                start_addr = (unsigned char *)starting_address + offset;

                rc = transient_dcbt((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern );
                if ( rc != 0 ) {
                    sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%llx Actual data = 0x%llx thread_index : 0x%x Start of memory = %p, memory size = 0x%llx\n"
                                ,__LINE__,(unsigned long long)th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, (void *)(unsigned long)starting_address, memory_fetch_size);
                    hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
                    dump_miscompare_data(thread_no, (unsigned char *)starting_address);
                    return;
                }
            }
            else if (th_array[thread_no].prefetch_algorithm == PREFETCH_IRRITATOR) {
                rc = do_prefetch( starting_address , memory_fetch_size , random_no, thread_no, loop_count, th_array[thread_no].pattern);
                if ( rc != 0 ) {
                    sprintf(msg,"[%d] Miscompare in Prefetch!! Expected data = 0x%llx Actual data = 0x%llx thread_index : 0x%x Start of memory = %p, memory size = 0x%llx\n"
                                ,__LINE__,(unsigned long long)th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)starting_address + 128*(loop_count-rc)), thread_no, (void *)(unsigned long)starting_address, memory_fetch_size);
                    hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
                    dump_miscompare_data(thread_no, (unsigned char *)starting_address);
                    return;
                }
            }
            else if ( th_array[thread_no].prefetch_algorithm == PREFETCH_NA ) {
                /* Start 0..15 bytes into the region, chosen from the random value. */
                offset     = random_no % (long)16;
                start_addr = (unsigned char *)starting_address + offset;
                rc = prefetch_dcbtna((unsigned long long)start_addr, loop_count, th_array[thread_no].pattern,&temp_storage, &temp_pattern);
                if ( rc != 0 ) {
                    sprintf(msg,"[%d] Miscompare in Prefetch ( returned %d)!! Expected data = 0x%llx Actual data = 0x%llx copied data = 0x%llx, copied pattern = 0x%llx, thread_index : 0x%x Start of memory = %p, offset = %ld\n"
                                ,__LINE__, rc, (unsigned long long)th_array[thread_no].pattern, *(unsigned long long *)((unsigned char *)start_addr + 128*(loop_count-rc)), temp_storage, temp_pattern, thread_no, (void *)(unsigned long)starting_address, offset);
                    hxfmsg(&h_d, 0, HTX_HE_MISCOMPARE, msg);
                    dump_miscompare_data(thread_no, (unsigned char *)starting_address);
                    return;
                }
            }

            /* if SIGTERM was received, exit */
            if (exit_flag != 0) {
                break;
            }
        }

    } /* End of for loop */

#ifdef __HTX_LINUX__
    /* Restore original/default CPU affinity so that it binds to ANY available processor */
    rc = htx_unbind_thread();
#else
    rc = bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
#endif
    if (rc == -1) {
        sprintf(msg, "%d: Unbinding from cpu:%d failed with errno %d \n",__LINE__, tid, errno);
        hxfmsg(&h_d, errno, HTX_HE_SOFT_ERROR, msg);
    }

#if defined(__HTX_MAMBO__) || defined(AWAN)
    printf("[%d] Thread no: %d, completed passes : %d\n",__LINE__, thread_no, oper);
#endif
}
Example #5
0
void qcdoc_su3_recon( char *name)
{
    /****  This section defines all the registers and offsets I need ****/

    /*
     * This marks the argument registers as defined by ABI as off limits
     * to us until they are freed by "getarg()";
     */
    int dum = defargcount(4);

    /*Handle for the loop entry point*/
    int branchsite;
    int branchmu;
    int retno ;

    /*------------------------------------------------------------------
     * Floating point registers
     *------------------------------------------------------------------
     */

    // Reconstruct 8 registers for 4 spinor
    //  reg_array_2d(PSI,Fregs,4,2);
    reg_array_3d(PSI,Fregs,3,4,2);
    offset_3d(PSI_IMM,FourSpinType,4,3,2);    /*Offsets within 4 spinor*/

    // Reconstruct 2 spinor registers
#define  NEO 2
    reg_array_3d(Atmp,Fregs,1,2,2); /*CHIplus  regs */
    reg_array_3d(Btmp,Fregs,1,2,2); /*CHIminus regs */
    int A[NEO][2][2] = {
        Atmp[0][0][0],    Atmp[0][0][1],
        Atmp[0][1][0],    Atmp[0][1][1],
        -1,-1,-1,-1
    };
    int B[NEO][2][2] = {
        Btmp[0][0][0],    Btmp[0][0][1],
        Btmp[0][1][0],    Btmp[0][1][1],
        -1,-1,-1,-1
    };

    /*Regs for SU3 two spinor multiply ... overlap with the reconstruct*/
    /*                                                      registers  */
    int CHIR[3][2][2] = {
        A[0][0][0],A[0][0][1],
        A[0][1][0],A[0][1][1],
        B[0][0][0],B[0][0][1],
        B[0][1][0],B[0][1][1],
        PSI[0][0][0],PSI[0][0][1],
        PSI[0][1][0],PSI[0][1][1]
    };
    offset_3d(CHI_IMM,TwoSpinType,3,2,2);

    /*Registers for the gauge link (2 rows)*/
    int UA[3][2] = {
        {PSI[0][2][0],PSI[0][2][1]},
        {PSI[2][1][0],PSI[2][1][1]},
        {PSI[1][0][0],PSI[1][0][1]}
    };
    int UB[3][2] = {
        {PSI[1][1][0],PSI[1][1][1]},
        {PSI[2][0][0],PSI[2][0][1]},
        {PSI[1][2][0],PSI[1][2][1]},
    };
    offset_3d(GIMM    , GaugeType, 3, 3 ,2 );

    // Other 8 registers used for reduction variables in SU3.
    // Could use these in reconstruct??
    int E[2] = { PSI[2][2][0],PSI[2][2][1]};

    /*
     * FCD used for drain of Chi
     * Overlap with PSI[*][3][*]
     */
    int F[2] = {PSI[0][3][0],PSI[0][3][1]};
    int C[2] = {PSI[1][3][0],PSI[1][3][1]};
    int D[2] = {PSI[2][3][0],PSI[2][3][1]};

    /*
     * Integer registers
     */
    alreg(psi,Iregs);
    alreg(Umu,Iregs);
    alreg(Ufetch,Iregs);

    alreg(Chiin,Iregs);
    alreg(Chiout,Iregs);

    alreg(Chifetch,Iregs);

    reg_array_1d(Chiplus,Iregs,4);/*Pointers to the 8 2-spinors for recombination*/
    reg_array_1d(Chiminus,Iregs,4);

    alreg(mu,Iregs);
    alreg(Chidrain,Iregs);
    alreg(pref,Iregs);

    alreg(mem,Iregs);
    alreg(length,Iregs);

    int Isize = PROC->I_size;
    int Fsize = PROC->FP_size;

    def_off( ZERO_IMM, Byte,0);
    def_off( PSI_ATOM, FourSpinType, 24);
    def_off( CHI_ATOM, TwoSpinType, 12);
    def_off( PAD_CHI_ATOM, TwoSpinType, 16);
    def_off( MAT_IMM, GaugeType, 18);

    int Ndim   = def_offset(4,Byte,"Ndim");
    int Ndimm1 = def_offset(3,Byte,"Ndimm1");
    int hbias,bias;

    /*Offsets handles to stack*/
    int hbitbucket = def_offset(16*Isize,Byte,"hbitbucket");
    int Tsize;
    if ( TwoSpinType == Double ) Tsize = PROC->FP_size;
    else Tsize = PROC->FSP_size;
    int hstk0   = def_offset(16*Isize+12*Tsize  ,Byte,"hstk0");
    int hstk1   = def_offset(16*Isize+2*12*Tsize,Byte,"hstk1");
    int hstk2   = def_offset(16*Isize+3*12*Tsize,Byte,"hstk2");
    int hstk3   = def_offset(16*Isize+4*12*Tsize,Byte,"hstk3");

    int hIsize  = def_offset(Isize,Byte,"Isize");

    int i,co,j,k,nxt,ri,sp,nxtco,eop,eo_a,eo_b;

    /***********************************************************************/

    /*
    * PROLOGUE
    	  */

    make_inst(DIRECTIVE,Enter_Routine,name);

    /*Allocate stack save any callee save registers we need etc...*/
    int stack_buf_size;
    stack_buf_size = 16*Isize +
                     12*Fsize * 5 ;

    hbias = grab_stack(stack_buf_size);
    bias = get_offset(hbias);
    save_regs();
    queue_iadd_imm(mem,PROC->StackPointer,hbias); /*Pointer to buf on stack*/

    /*Define our arguments - all pointers ala fortran*/
    getarg(psi);
    getarg(Umu);
    getarg(Chiin);
    getarg(length);
    /*{... Process arguments ...*/

    queue_iload(length,ZERO_IMM,length);      /*Load in sx counter*/

    retno = get_target_label(); /*Branch to exit if yzt <1*/
    check_iterations(length,retno);

    need_cache_line(0);
    need_cache_line(1);
    need_cache_line(2);
    need_cache_line(3);
    need_cache_line(4);

    pragma(DCBT_SPACE,5);
    pragma(DCBT_POST,1);

#define LOAD_U(comin,comax)\
  /*Load two link rows*/\
  for( i = comin;i<=comax;i++ ){\
    for( ri=0;ri<2;ri++){  \
      queue_fload(UA[i][ri],GIMM[i][0][ri],Umu,GaugeType);\
      queue_fload(UB[i][ri],GIMM[i][1][ri],Umu,GaugeType);\
    } \
  }

#define PRELOAD_U  LOAD_U(0,1)
#define POSTLOAD_U  LOAD_U(2,2)

    PRELOAD_U

#define LOAD_CHI(comin,comax) \
    /*Load Chi column*/\
    for( i = comin;i<=comax;i++ ){\
      for( ri=0;ri<2;ri++){\
        queue_fload(CHIR[i][0][ri],CHI_IMM[i][0][ri],Chiin,TwoSpinType);\
      } \
      for( ri=0;ri<2;ri++){\
        queue_fload(CHIR[i][1][ri],CHI_IMM[i][1][ri],Chiin,TwoSpinType);\
      } \
    }

#define PRELOAD_CHI  LOAD_CHI(0,1)
#define POSTLOAD_CHI  LOAD_CHI(2,2)

#define POSTLOAD \
    POSTLOAD_CHI \
    POSTLOAD_U

    do_prefetch(Chiin,0);
    do_prefetch(Chiin,1);
    if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chiin,2);

    PRELOAD_CHI

    /*
     * Start site loop
     */

    queue_iadd_imm(Chidrain,mem,hbitbucket);

    branchsite = start_loop(length);

    queue_iadd_imm(Chiout,mem,hstk0);

    /*
     * Loop over mu in asm
     */
    queue_iload_imm(mu,Ndimm1);

#define CHIDRAIN \
      queue_fstore(F[0],CHI_IMM[1][1][0],Chidrain,TwoSpinType);\
      queue_fstore(F[1],CHI_IMM[1][1][1],Chidrain,TwoSpinType);\
      queue_fstore(C[0],CHI_IMM[2][0][0],Chidrain,TwoSpinType);\
      queue_fstore(C[1],CHI_IMM[2][0][1],Chidrain,TwoSpinType);\
      queue_fstore(D[0],CHI_IMM[2][1][0],Chidrain,TwoSpinType);\
      queue_fstore(D[1],CHI_IMM[2][1][1],Chidrain,TwoSpinType);


#define PREFETCH_CHI \
  queue_iadd_imm(Chifetch,Chiin,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);

#define PREFETCH_CHIF \
  queue_iadd_imm(Chifetch,Chifetch,PAD_CHI_ATOM);\
  do_prefetch(Chifetch,0);\
  do_prefetch(Chifetch,1);\
  if ( SizeofDatum(TwoSpinType) == 8 ) do_prefetch(Chifetch,2);



    for ( int unroll=0; unroll<2; unroll++ ) {

        if ( unroll==0 ) {
            branchmu = start_loop(mu);
            pragma(DCBT_SPACE,5);
            pragma(STORE_LIM,1);
            pragma(LOAD_LIM,2);
        } else {
            pragma(STORE_LIM,2);
            pragma(DCBT_SPACE,5);
            pragma(DCBT_POST,1);
            pragma(DCBT_PRE,0);
            pragma(LOAD_LIM,2);
        }

        CHIDRAIN
        POSTLOAD

        if ( unroll == 0 ) {
            PREFETCH_CHI
            queue_iadd_imm(Ufetch,Umu,MAT_IMM);
            do_prefetch(Ufetch,0);
            do_prefetch(Ufetch,1);
            do_prefetch(Ufetch,2);
            if ( GaugeType == Double ) {
                do_prefetch(Ufetch,3);
                do_prefetch(Ufetch,4);
            }
        } else {
            pragma(DCBT_SPACE,3);
            PREFETCH_CHI
            PREFETCH_CHIF
            PREFETCH_CHIF
            PREFETCH_CHIF
        }




        j=0;
        queue_three_cmuls(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                          D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                          E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
        j=1;
        queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                           D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                           E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);
        j=2;

        queue_three_cmadds(C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                           D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1],
                           E[0],E[1],UB[j][0],UB[j][1],CHIR[j][0][0],CHIR[j][0][1]);

        /*Store the first three results*/
        queue_fstore(C[0],CHI_IMM[0][0][0],Chiout,TwoSpinType);
        queue_fstore(C[1],CHI_IMM[0][0][1],Chiout,TwoSpinType);
        queue_fstore(D[0],CHI_IMM[0][1][0],Chiout,TwoSpinType);
        queue_fstore(D[1],CHI_IMM[0][1][1],Chiout,TwoSpinType);
        queue_fstore(E[0],CHI_IMM[1][0][0],Chiout,TwoSpinType);
        queue_fstore(E[1],CHI_IMM[1][0][1],Chiout,TwoSpinType);

        /*Load the third row*/
        for(j=0; j<3; j++) {
            for(ri=0; ri<2; ri++) {
                queue_fload(UA[j][ri],GIMM[j][2][ri],Umu,GaugeType);
            }
        }
        /*Gauge layout is linear, mu faster than site*/
        queue_iadd_imm(Umu,Umu,MAT_IMM);


        /*Now the second set of three cdots*/

        j=0;
        queue_three_cmuls(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                          C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                          D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
        j=1;
        queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                           C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                           D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);
        j=2;
        queue_three_cmadds(F[0],F[1],UB[j][0],UB[j][1],CHIR[j][1][0],CHIR[j][1][1],
                           C[0],C[1],UA[j][0],UA[j][1],CHIR[j][0][0],CHIR[j][0][1],
                           D[0],D[1],UA[j][0],UA[j][1],CHIR[j][1][0],CHIR[j][1][1]);

        /**************END SU3 CODE *************/

        queue_iadd_imm(Chiin,Chiin,PAD_CHI_ATOM);
        queue_iadd_imm(Chidrain,Chiout,ZERO_IMM);
        queue_iadd_imm(Chiout,Chiout,CHI_ATOM);

        if ( unroll == 0 ) {

            PRELOAD_U
            PRELOAD_CHI

        }

        /*********************************************************/
        /****************** END OF SU3 MULTIPLY ******************/
        /*********************************************************/

        if ( unroll== 0 ) {
            stop_loop(branchmu,mu); /* End loop over mu*/
            make_inst(DIRECTIVE,Target,get_target_label() ); /*delineate the sections*/
        }
    }


    /*********************************************************/
    /****************** START OF RECONSTRUCT *****************/
    /*********************************************************/

    //Address calculation...
    // Chiminus -> Stack  and  ChiPlus -> Chiin

    pragma(STORE_INORDER,1);
    queue_iadd_imm(Chiminus[0],mem,hstk0);

    /*For register use reasons loop over colour outermost*/

#define LOAD_CHI_MU0(eo,co) \
    for( sp = 0; sp<2;sp++ ){\
      for( ri = 0; ri<2;ri++ ){\
	queue_fload(A[eo][sp][ri],CHI_IMM[co][sp][ri],Chiminus[0],TwoSpinType);\
	if ( co == 0 ) {\
	  queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiin,TwoSpinType);\
	  queue_iadd_imm(Chiplus[0],Chiin,ZERO_IMM);\
	} else {\
	  queue_fload(B[eo][sp][ri],CHI_IMM[co][sp][ri],Chiplus [0],TwoSpinType);\
	}\
      }}


    pragma(LOAD_LIM,2);
    LOAD_CHI_MU0(0,0)
    pragma(DCBT_POST,1);

    CHIDRAIN

    int neo_a = NEO;
    int neo_b = NEO;
    eo_a = 0;
    eo_b = 0;

    for ( co = 0; co <3 ; co ++ ) {

        pragma(LOAD_LIM,1);
        if ( co == 0 ) {
            // Use the third colour for unrolling the loads
            A[1][0][0] = PSI[2][0][0];
            A[1][0][1] = PSI[2][0][1];
            A[1][1][0] = PSI[2][1][0];
            A[1][1][1] = PSI[2][1][1];
            B[1][0][0] = PSI[2][2][0];
            B[1][0][1] = PSI[2][2][1];
            B[1][1][0] = PSI[2][3][0];
            B[1][1][1] = PSI[2][3][1];
            queue_iadd_imm(Chiminus[1],mem,hstk1); // This is invariant of loop
            // Take out
            queue_iadd_imm(Chiplus[1],Chiin     ,PAD_CHI_ATOM);
        }

        /***************************************************************
        * MU = 0 reconstruct                                           *
        ****************************************************************/



        if ( co == 2 ) {
            // Flip to not unrolled due to register pressure
            neo_b = 1;
            neo_a = 2;

            A[1][0][0] = PSI[0][0][0];
            A[1][0][1] = PSI[0][0][1];
            A[1][1][0] = PSI[1][0][0];
            A[1][1][1] = PSI[1][0][1];

            pragma(DCBT_POST,0);
            pragma(DCBT_SPACE,1);
            queue_iadd_imm(Ufetch,Umu,ZERO_IMM);
            //      do_prefetch(Ufetch,0);
            do_prefetch(Ufetch,1);
            do_prefetch(Ufetch,2);
            if ( GaugeType == Double ) {
                do_prefetch(Ufetch,3);
                do_prefetch(Ufetch,4);
            }
        }
        /* psi_0 =  Chiplus[0] + Chiminus[0] */
        /* psi_1 =  Chiplus[1] + Chiminus[1] */

        queue_fadd(PSI[co][0][0],B[eo_b][0][0],A[eo_a][0][0]);
        queue_fadd(PSI[co][0][1],B[eo_b][0][1],A[eo_a][0][1]);
        queue_fadd(PSI[co][1][0],B[eo_b][1][0],A[eo_a][1][0]);
        queue_fadd(PSI[co][1][1],B[eo_b][1][1],A[eo_a][1][1]);

        // Dagger = 0:
        /* psi_2 =-iChiplus[1] +iChiminus[1] */
        /* psi_3 =-iChiplus[0] +iChiminus[0] */
        // Dagger = 1:
        /* psi_2 = iChiplus[1] -iChiminus[1] */
        /* psi_3 = iChiplus[0] -iChiminus[0] */
        if ( dagger == 0 ) {
            queue_fsub(PSI[co][2][0],B[eo_b][1][1],A[eo_a][1][1]);
            queue_fsub(PSI[co][2][1],A[eo_a][1][0],B[eo_b][1][0]);
            queue_fsub(PSI[co][3][0],B[eo_b][0][1],A[eo_a][0][1]);
            queue_fsub(PSI[co][3][1],A[eo_a][0][0],B[eo_b][0][0]);
        } else {
            queue_fsub(PSI[co][2][0],A[eo_a][1][1],B[eo_b][1][1]);
            queue_fsub(PSI[co][2][1],B[eo_b][1][0],A[eo_a][1][0]);
            queue_fsub(PSI[co][3][0],A[eo_a][0][1],B[eo_b][0][1]);
            queue_fsub(PSI[co][3][1],B[eo_b][0][0],A[eo_a][0][0]);
        }

        /***************************************************************
        * MU = 1 reconstruct                                           *
        ****************************************************************/

        eo_a = (eo_a+1)%neo_a;
        eo_b = (eo_b+1)%neo_b;
        for( sp = 0; sp<2; sp++ ) {
            for( ri = 0; ri<2; ri++ ) {

                queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[1],TwoSpinType);
                queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [1],TwoSpinType);

            }
        }

        if ( co == 0 ) {
            queue_iadd_imm(Chiminus[2],mem,hstk2);
            queue_iadd_imm(Chiminus[3],mem,hstk3);
            queue_iadd_imm(Chiplus[2],Chiplus[1],PAD_CHI_ATOM);
            queue_iadd_imm(Chiplus[3],Chiplus[2],PAD_CHI_ATOM);
        }

        /* psi_0 +=  Chiplus[0] + Chiminus[0] */
        /* psi_1 +=  Chiplus[1] + Chiminus[1] */

        queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
        queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
        queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
        queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

        queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
        queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
        queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
        queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

        //Dagger == 0
        /* psi_2 +=  Chiplus[1] - Chiminus[1] */
        /* psi_3 += -Chiplus[0] + Chiminus[0] */
        //Dagger == 1
        /* psi_2 -=  Chiplus[1] - Chiminus[1] */
        /* psi_3 -= -Chiplus[0] + Chiminus[0] */
        if ( dagger == 0 ) {
            queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
            queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
            queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
            queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);

            queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
            queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
            queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
            queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
        } else {
            queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][1][0]);
            queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][1][1]);
            queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][1][0]);
            queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][1][1]);

            queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][0][0]);
            queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][0][1]);
            queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][0][0]);
            queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][0][1]);
        }

        /***************************************************************
        * MU = 2 reconstruct                                           *
        ****************************************************************/
        eo_a = (eo_a+1)%neo_a;
        eo_b = (eo_b+1)%neo_b;
        for( sp = 0; sp<2; sp++ ) {
            for( ri = 0; ri<2; ri++ ) {

                queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[2],TwoSpinType);
                queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [2],TwoSpinType);

            }
        }

        /* psi_0 +=  Chiplus[0] + Chiminus[0] */
        /* psi_1 +=  Chiplus[1] + Chiminus[1] */

        queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
        queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
        queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
        queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);

        queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
        queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
        queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
        queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

        //Dagger == 0
        /* psi_2 +=-iChiplus[0] +iChiminus[0] */
        /* psi_3 += iChiplus[1] -iChiminus[1] */
        //Dagger == 1

        /* psi_2 -=-iChiplus[0] +iChiminus[0] */
        /* psi_3 -= iChiplus[1] -iChiminus[1] */
        if ( dagger == 0 ) {
            queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
            queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
            queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
            queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);

            queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
            queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
            queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
            queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
        } else {
            queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][1]);
            queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][0]);
            queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][1]);
            queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][0]);

            queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][1]);
            queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][0]);
            queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][1]);
            queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][0]);
        }


        /***************************************************************
        * MU = 3 reconstruct                                           *
        ****************************************************************/
        pragma(LOAD_LIM,2);

        eo_a = (eo_a+1)%neo_a;
        eo_b = (eo_b+1)%neo_b;
        for( sp = 0; sp<2; sp++ ) {
            for( ri = 0; ri<2; ri++ ) {
                queue_fload(A[eo_a][sp][ri],CHI_IMM[co][sp][ri],Chiminus[3],TwoSpinType);
                queue_fload(B[eo_b][sp][ri],CHI_IMM[co][sp][ri],Chiplus [3],TwoSpinType );
            }
        }

        /* psi_0 +=  Chiplus[0] + Chiminus[0] */
        /* psi_1 +=  Chiplus[1] + Chiminus[1] */

        queue_fadd(PSI[co][0][0],PSI[co][0][0],B[eo_b][0][0]);
        queue_fadd(PSI[co][0][1],PSI[co][0][1],B[eo_b][0][1]);
        queue_fadd(PSI[co][1][0],PSI[co][1][0],B[eo_b][1][0]);
        queue_fadd(PSI[co][1][1],PSI[co][1][1],B[eo_b][1][1]);


        //Dagger == 0
        /* psi_2 +=  Chiplus[0] - Chiminus[0] */
        /* psi_3 +=  Chiplus[1] - Chiminus[1] */
        //Dagger == 1
        /* psi_2 -=  Chiplus[0] - Chiminus[0] */
        /* psi_3 -=  Chiplus[1] - Chiminus[1] */
        if ( dagger == 0 ) {
            queue_fadd(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
            queue_fadd(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
            queue_fadd(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
            queue_fadd(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
        } else {
            queue_fsub(PSI[co][2][0],PSI[co][2][0],B[eo_b][0][0]);
            queue_fsub(PSI[co][2][1],PSI[co][2][1],B[eo_b][0][1]);
            queue_fsub(PSI[co][3][0],PSI[co][3][0],B[eo_b][1][0]);
            queue_fsub(PSI[co][3][1],PSI[co][3][1],B[eo_b][1][1]);
        }

        queue_fadd(PSI[co][0][0],PSI[co][0][0],A[eo_a][0][0]);
        queue_fadd(PSI[co][0][1],PSI[co][0][1],A[eo_a][0][1]);
        queue_fadd(PSI[co][1][0],PSI[co][1][0],A[eo_a][1][0]);
        queue_fadd(PSI[co][1][1],PSI[co][1][1],A[eo_a][1][1]);

        if ( dagger == 0 ) {
            queue_fsub(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
            queue_fsub(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
            queue_fsub(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
            queue_fsub(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
        } else {
            queue_fadd(PSI[co][2][0],PSI[co][2][0],A[eo_a][0][0]);
            queue_fadd(PSI[co][2][1],PSI[co][2][1],A[eo_a][0][1]);
            queue_fadd(PSI[co][3][0],PSI[co][3][0],A[eo_a][1][0]);
            queue_fadd(PSI[co][3][1],PSI[co][3][1],A[eo_a][1][1]);
        }
        /*
         * Store the spinors. If this is problematic
         * in terms of PEC WriteBuf misses, I could
         * store to the stack and copy out later.
         */

        if ( co != 2 ) {
            LOAD_CHI_MU0(0,co+1)
            eo_a=0;
            eo_b=0;
        }

        queue_fstore(PSI[co][0][0],PSI_IMM[0][co][0],psi,FourSpinType);
        queue_fstore(PSI[co][0][1],PSI_IMM[0][co][1],psi,FourSpinType);

    }

    /*
     * Store out in linear order now
     */
    pragma(STORE_LIM,2);
    pragma(DCBT_SPACE,8);

    for ( co=0; co<3; co ++ ) {
        queue_fstore(PSI[co][1][0],PSI_IMM[1][co][0],psi,FourSpinType);
        queue_fstore(PSI[co][1][1],PSI_IMM[1][co][1],psi,FourSpinType);
    }
    for ( co=0; co<3; co ++ ) {
        queue_fstore(PSI[co][2][0],PSI_IMM[2][co][0],psi,FourSpinType);
        queue_fstore(PSI[co][2][1],PSI_IMM[2][co][1],psi,FourSpinType);
    }
    if ( TwoSpinType == FourSpinType ) {
        queue_iadd_imm(Chidrain,psi,CHI_ATOM);
    } else {
        queue_iadd_imm(Chidrain,mem,hbitbucket);
        for ( co=0; co<3; co ++ ) {
            queue_fstore(PSI[co][3][0],PSI_IMM[3][co][0],psi,FourSpinType);
            queue_fstore(PSI[co][3][1],PSI_IMM[3][co][1],psi,FourSpinType);
        }
    }

    queue_iadd_imm(psi,psi,PSI_ATOM);
    /*
     * Put in an artificial dependency here
     * to try to stop the preloads getting above the last load of
     * reconstruct.
     */
    queue_iadd_imm(Chiplus[3],Chiplus[3],ZERO_IMM);
    queue_iadd_imm(Chiin     ,Chiplus[3],PAD_CHI_ATOM);
    pragma(DCBT_SPACE,0);
    do_prefetch(Chiin,0);
    do_prefetch(Chiin,1);
    if ( SizeofDatum(TwoSpinType) == 8 )do_prefetch(Chiin,2);
    PRELOAD_U
    PRELOAD_CHI

    /* TERMINATION point of the loop*/
    stop_loop(branchsite,length);

    CHIDRAIN

    make_inst(DIRECTIVE,Target,retno);

    /*
    *
       * EPILOGUE
       *
       */

    restore_regs();
    free_stack();
    make_inst(DIRECTIVE,Exit_Routine,name);

    return;

}
Example #6
0
/*
 * Calls do_flush(addr, line) and then issues exactly one follow-up call
 * with the same arguments: l2_touch when dd2 is non-zero, do_prefetch
 * otherwise.
 */
void touch(int addr, int line)
{
    do_flush(addr, line);

    if (!dd2) {
        do_prefetch(addr, line);
        return;
    }

    l2_touch(addr, line);
}
Example #7
0
/*
 * Queue the instruction stream for a routine called `name`.  On each loop
 * iteration the generated code loads three values from each of vec1ptr
 * and vec2ptr, combines them pairwise with complex_simd_merge into six
 * output registers, and stores those six registers through outptr.
 *
 * name: passed to the Enter_Routine and Exit_Routine directives.
 *
 * The order of the queue_* / make_inst calls below is the order in which
 * they are queued, so statements here should not be reordered casually.
 *
 * NOTE(review): defargcount(5) is called, but only four getarg() calls
 * follow (outptr, vec1ptr, vec2ptr, counter) -- confirm the count.
 */
void qcdoc_merge( char *name)
{
  /* `dum` is not referenced again in this routine. */
  int dum = defargcount(5);

  /*Integer register usage*/
  alreg(outptr,Iregs);
  alreg(vec1ptr,Iregs);
  alreg(vec2ptr,Iregs);
  alreg(counter,Iregs);

  /*Floating register usage*/
  /* vec1/vec2: one register per colour index for each input;
   * oreg: the six merged results; permreg: handed to complex_simd_init. */
  reg_array_1d(vec1,Cregs,3);
  reg_array_1d(vec2,Cregs,3);
  reg_array_1d(oreg,Cregs,6);
  alreg(permreg,Cregs);

  def_off(ZERO,SpinorType,0);
  def_off (IN_ATOM,SpinorType,6*nsimd()); // 2spins worth, 3 colors x complex 
  def_off (OUT_ATOM,SpinorType,12*nsimd());// 2spins worth, 3 colors x complex x simd  
  /* Byte-typed immediates used only when building Mask for half precision. */
  def_off(bits16,Byte,0xFFFF);
  def_off(thirtytwo,Byte,32);
  def_off(sixteen,Byte,16);


  /* CHI_IMM[i][0] is the immediate used for the i-th load/store below. */
  offset_2d(CHI_IMM,SpinorType,6,2*nsimd());

  /* NOTE(review): word_size is not referenced again in this routine;
   * confirm whether the def_offset call is needed for its side effect. */
  int Isize = PROC->I_size;
  int word_size = def_offset(Isize,Byte,"word_size");

  struct stream *PreOut;
  struct stream *PreVec1;
  struct stream *PreVec2;

  int brchno,retno; /*Branch target handles*/
  int co;

  /* Prologue: the stack pointer is advanced by `bias` here and moved back
   * by the same amount just before restore_regs() at the end. */
  make_inst(DIRECTIVE,Enter_Routine,name);
  int bias = grab_stack(64);
  save_regs();
  queue_iadd_imm(PROC->StackPointer,PROC->StackPointer,bias);

  getarg(outptr);           /*Get args*/
  getarg(vec1ptr);  
  getarg(vec2ptr);
  getarg(counter);

  /* Integer registers used only by the half-precision load/store helpers. */
  alreg(Mask,Iregs);
  alreg(Convert1,Iregs);
  alreg(Convert2,Iregs);
  /* The stack pointer is what gets passed as the `memory` argument of
   * complex_load_half / complex_store_half. */
  int memory = PROC->StackPointer;

  /* Register the constants i*2*SizeofDatum(SpinorType)*nsimd() for i=0..5,
   * plus 64 -- presumably the byte offsets behind CHI_IMM; TODO confirm. */
  for (int i =0; i<6; i++ ) { 
    need_constant(i*2*SizeofDatum(SpinorType)*nsimd());
  }
  need_constant(64);
  complex_simd_init(permreg);

  if ( half_precision ) {
    /* Queue the sequence load ZERO, or bits16, shift by thirtytwo,
     * or bits16, shift by sixteen into Mask.
     * NOTE(review): presumably yields 0xFFFF0000FFFF0000 -- confirm
     * against the semantics of queue_ori / queue_lshift. */
    queue_iload_imm(Mask,ZERO);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,thirtytwo);
    queue_ori(Mask,Mask,bits16);
    queue_lshift(Mask,Mask,sixteen);
  }

  /*
   * Insert a label to prevent reordering
   */
  make_inst(DIRECTIVE,Target,get_target_label());

  /* Two input streams created with IN_ATOM and one output stream created
   * with OUT_ATOM, all tied to `counter`. */
  PreVec1= create_stream(IN_ATOM,vec1ptr ,counter,STREAM_IN ,LINEAR);
  PreVec2= create_stream(IN_ATOM,vec2ptr ,counter,STREAM_IN ,LINEAR);
  PreOut = create_stream(OUT_ATOM,outptr  ,counter,STREAM_OUT ,LINEAR);

  /*Branch to stack restore if length <1*/
  retno = get_target_label();
  check_iterations(counter,retno); 

  /*
   * Start software pipeline
   */

  brchno = start_loop(counter);

  /* indco is the identity ordering, so co == ico in the loop below. */
  int indco[3]={0,1,2};
  /* Passed unchanged as the second argument of every complex_simd_merge. */
  int permute_mu=3;

  /* Load vec1[co] and vec2[co] from CHI_IMM[co][0] off each input pointer. */
  for(int ico=0;ico<3;ico++){
    co = indco[ico];
    // Could do entirely in integer unit for half precision to accelerate this
    if ( half_precision ) { 
      complex_load_half(vec1[co],CHI_IMM[co][0],vec1ptr,memory,Convert1,Convert2,Mask);
      complex_load_half(vec2[co],CHI_IMM[co][0],vec2ptr,memory,Convert1,Convert2,Mask);
    } else { 
      complex_load(vec1[co],CHI_IMM[co][0],vec1ptr,SpinorType);
      complex_load(vec2[co],CHI_IMM[co][0],vec2ptr,SpinorType);
    }
  }

  {
    // Merge the vectors
    /* First pass (first argument 0) fills the even oreg entries, second
     * pass (first argument 1) fills the odd ones, from the same inputs. */
    for(co=0;co<3;co++) complex_simd_merge (0,permute_mu,oreg[co*2]  ,vec1[co],vec2[co]);
    for(co=0;co<3;co++) complex_simd_merge (1,permute_mu,oreg[co*2+1],vec1[co],vec2[co]);
  }
  //  make_inst(DIRECTIVE,LS_BARRIER);
  /* Store all six merged registers at CHI_IMM[i][0] off outptr. */
  for(int i=0;i<6;i++){ // 2 SIMD sites, 3 colors, 2 spins 2 complex == 24 floats
    if ( half_precision ) { 
      complex_store_half(oreg[i],CHI_IMM[i][0],outptr,memory,Convert1,Convert2,Mask);
    } else { 
      complex_store(oreg[i],CHI_IMM[i][0],outptr,SpinorType);
    }
  }

  /* Step both input streams, then queue prefetches of lines 0 and 1 from
   * each input pointer. */
  iterate_stream(PreVec1);
  iterate_stream(PreVec2);

  do_prefetch(vec1ptr,0);
  do_prefetch(vec2ptr,0);
  do_prefetch(vec1ptr,1);
  do_prefetch(vec2ptr,1);


  iterate_stream(PreOut);

  stop_loop(brchno,counter);
  
  /* Target label that check_iterations was given above. */
  make_inst(DIRECTIVE,Target,retno);

  /* Epilogue: undo the stack-pointer bias applied in the prologue. */
  queue_isub_imm(PROC->StackPointer,PROC->StackPointer,bias);
  restore_regs();
  free_stack();
  make_inst(DIRECTIVE,Exit_Routine,name);

  return;
}