/* Release two complete sets of gather tags.
 *
 * For every direction in XUP..TUP, and then for every direction in
 * X3UP..T3UP, cleanup_gather() is called on the entry for that direction
 * and on the entry for its opposite (OPP_DIR / OPP_3_DIR), first in tags1
 * and then in tags2.
 */
void cleanup_gathers(msg_tag *tags1[], msg_tag *tags2[])
{
  int dir;

  /* XUP..TUP and the matching OPP_DIR entries */
  dir = XUP;
  while(dir <= TUP){
    cleanup_gather( tags1[dir] );
    cleanup_gather( tags1[OPP_DIR(dir)] );
    cleanup_gather( tags2[dir] );
    cleanup_gather( tags2[OPP_DIR(dir)] );
    dir++;
  }

  /* X3UP..T3UP and the matching OPP_3_DIR entries */
  dir = X3UP;
  while(dir <= T3UP){
    cleanup_gather( tags1[dir] );
    cleanup_gather( tags1[OPP_3_DIR(dir)] );
    cleanup_gather( tags2[dir] );
    cleanup_gather( tags2[OPP_3_DIR(dir)] );
    dir++;
  }
}
/* Release one complete set of gather tags.
 *
 * Calls cleanup_gather() on the entry for each direction in XUP..TUP and
 * its OPP_DIR counterpart, then on each direction in X3UP..T3UP and its
 * OPP_3_DIR counterpart.
 */
static void cleanup_one_gather_set(msg_tag *tags[])
{
  int dir;

  /* XUP..TUP and the matching OPP_DIR entries */
  dir = XUP;
  while(dir <= TUP){
    cleanup_gather( tags[dir] );
    cleanup_gather( tags[OPP_DIR(dir)] );
    dir++;
  }

  /* X3UP..T3UP and the matching OPP_3_DIR entries */
  dir = X3UP;
  while(dir <= T3UP){
    cleanup_gather( tags[dir] );
    cleanup_gather( tags[OPP_3_DIR(dir)] );
    dir++;
  }
}
/* Apply dslash to a site-structure field and release the gathers.
 *
 * Thin wrapper: runs dslash_fn_site_special() with start=1 on a local tag
 * array, then calls cleanup_gather() on every tag indexed by XUP..TUP,
 * X3UP..T3UP and their OPP_DIR / OPP_3_DIR counterparts.
 *
 *   src, dest  field offsets of the source and destination vectors
 *   parity     parity selector forwarded to the special routine
 *   fn         link structure forwarded to the special routine
 */
void dslash_fn_site( field_offset src, field_offset dest, int parity,
                     fn_links_t *fn )
{
  msg_tag *gather_tags[16];
  register int mu;

  dslash_fn_site_special(src, dest, parity, gather_tags, 1, fn );

  /* free up the buffers: XUP..TUP entries first ... */
  mu = XUP;
  while(mu <= TUP){
    cleanup_gather(gather_tags[mu]);
    cleanup_gather(gather_tags[OPP_DIR(mu)]);
    mu++;
  }

  /* ... then the X3UP..T3UP entries */
  mu = X3UP;
  while(mu <= T3UP){
    cleanup_gather(gather_tags[mu]);
    cleanup_gather(gather_tags[OPP_3_DIR(mu)]);
    mu++;
  }
} /* end dslash_fn_site */
/* Apply dslash to a field-major vector and release the gathers.
 *
 * Thin wrapper: runs dslash_fn_field_special() with start=1 on a local tag
 * array, then calls cleanup_gather() on every tag indexed by XUP..TUP,
 * X3UP..T3UP and their OPP_DIR / OPP_3_DIR counterparts.
 *
 *   src, dest  source and destination vector fields
 *   parity     parity selector forwarded to the special routine
 *   fn         link structure forwarded to the special routine
 */
void dslash_fn_field( su3_vector *src, su3_vector *dest, int parity,
                      fn_links_t *fn)
{
  msg_tag *gather_tags[16];
  register int mu;

  dslash_fn_field_special(src, dest, parity, gather_tags, 1, fn);

  /* free up the buffers: XUP..TUP entries first ... */
  mu = XUP;
  while(mu <= TUP){
    cleanup_gather(gather_tags[mu]);
    cleanup_gather(gather_tags[OPP_DIR(mu)]);
    mu++;
  }

  /* ... then the X3UP..T3UP entries */
  mu = X3UP;
  while(mu <= T3UP){
    cleanup_gather(gather_tags[mu]);
    cleanup_gather(gather_tags[OPP_3_DIR(mu)]);
    mu++;
  }
}
/* Special dslash_site for use by congrad.  Uses restart_gather_site() when
   possible.

   src, dest  field offsets of the source and destination vectors
   parity     parity of the destination sites (EVEN, ODD or EVENANDODD)
   tag        array of message tags, to be set if this is the first use
              (start==1), otherwise reused
   start      if 1, start new gathers; otherwise restart the gathers
              recorded in tag
   fn         link structure; the routine terminates if it is NULL

   The calling program must clean up the gathers!

   The work space (tempvec, templongvec, templongv1) is allocated on the
   first call and kept for later calls, in the same way the field version
   of this routine keeps its temp[] arrays.  The negative-direction gathers
   are declared against tempvec/templongvec, so a restarted gather is only
   meaningful while those buffers keep their address; allocating fresh
   buffers on every call (as was done previously) leaked them and left
   restarted gathers pointing at the first call's buffers.

   NOTE(review): with NO_LONG_LINKS defined the waits on tag[OPP_3_DIR(dir)]
   and the private(fat4,long4) clauses still refer to objects that are only
   set up when long links are enabled -- confirm that configuration. */
void dslash_fn_site_special( field_offset src, field_offset dest,
                             int parity, msg_tag **tag, int start,
                             fn_links_t *fn){
  register int i;
  register site *s;
  register int dir,otherparity=0;
  register su3_matrix *fat4;
  su3_matrix *t_fatlink;
#ifndef NO_LONG_LINKS
  register su3_matrix *long4;
  su3_matrix *t_longlink;
  /* persistent work space, allocated on first use */
  static su3_vector *templongvec = NULL;
  static su3_vector *templongv1 = NULL;
#endif
  /* persistent work space, allocated on first use */
  static su3_vector *tempvec = NULL;
  char myname[] = "dslash_fn_site_special";

  if(fn == NULL){
    printf("dslash_fn_site_special: invalid fn links!\n");
    terminate(1);
  }
#ifndef NO_LONG_LINKS
  t_longlink = get_lnglinks(fn);
#endif
  t_fatlink = get_fatlinks(fn);

  /* Four vectors per site: one per direction */
  if(tempvec == NULL){
    tempvec = (su3_vector *) malloc(sizeof(su3_vector)*4*sites_on_node);
    if(tempvec == NULL){
      printf("%s(%d)No room for temporary\n",myname, this_node);
      terminate(1);
    }
  }

#ifndef NO_LONG_LINKS
  if(templongvec == NULL){
    templongvec = (su3_vector *) malloc(sizeof(su3_vector)*4*sites_on_node);
    if(templongvec == NULL){
      printf("%s(%d)No room for temporary\n",myname, this_node);
      terminate(1);
    }
  }

  /* Every entry that is read below is written first in the same call,
     so the field does not need to be re-created per call */
  if(templongv1 == NULL){
    templongv1 = create_v_field();
  }
#endif

  switch(parity){
  case EVEN:       otherparity=ODD; break;
  case ODD:        otherparity=EVEN; break;
  case EVENANDODD: otherparity=EVENANDODD; break;
  }

  /* Start gathers from positive directions */
  for(dir=XUP; dir<=TUP; dir++){
    if(start==1) tag[dir] = start_gather_site( src, sizeof(su3_vector),
                                               dir, parity, gen_pt[dir] );
    else restart_gather_site( src, sizeof(su3_vector), dir, parity,
                              gen_pt[dir] , tag[dir] );
  }

  /* and start the 3rd neighbor gather */
  for(dir=X3UP; dir<=T3UP; dir++){
    if(start==1) tag[dir] = start_gather_site( src, sizeof(su3_vector),
                                               dir, parity, gen_pt[dir] );
    else restart_gather_site( src, sizeof(su3_vector), dir, parity,
                              gen_pt[dir] , tag[dir] );
  }

  /* Multiply by adjoint matrix at other sites */
  FORSOMEPARITYDOMAIN_OMP(i,s,otherparity,private(fat4,long4)){
    if( i < loopend-FETCH_UP ){
      /* Prefetch for the site FETCH_UP ahead.  The destination is the
         group of four vectors belonging to that site, i.e. index
         4*(i+FETCH_UP), matching the link index used for fat4/long4. */
      fat4 = &(t_fatlink[4*(i+FETCH_UP)]);
      prefetch_4MV4V(
                     fat4,
                     (su3_vector *)F_PT(s+FETCH_UP,src),
                     tempvec+4*(i+FETCH_UP) );
#ifndef NO_LONG_LINKS
      long4 = &(t_longlink[4*(i+FETCH_UP)]);
      prefetch_4MV4V(
                     long4,
                     (su3_vector *)F_PT(s+FETCH_UP,src),
                     templongvec+4*(i+FETCH_UP) );
#endif
    }

    fat4 = &(t_fatlink[4*i]);
#ifndef NO_LONG_LINKS
    long4 = &(t_longlink[4*i]);
#endif
    mult_adj_su3_mat_vec_4dir( fat4,
                               (su3_vector *)F_PT(s,src), (tempvec+4*i) );
    /* multiply by 3-link matrices too */
#ifndef NO_LONG_LINKS
    mult_adj_su3_mat_vec_4dir( long4,
                               (su3_vector *)F_PT(s,src), (templongvec+4*i) );
#endif
  } END_LOOP_OMP

  /* Start gathers from negative directions */
  for( dir=XUP; dir <= TUP; dir++){
    if (start==1){
      /* We need the strided gather so we can pick off one of a
         group of four vectors in tempvec */
      tag[OPP_DIR(dir)] =
        declare_strided_gather( (char *)(tempvec+dir), 4*sizeof(su3_vector),
                                sizeof(su3_vector), OPP_DIR( dir), parity,
                                gen_pt[OPP_DIR(dir)] );
      prepare_gather(tag[OPP_DIR(dir)]);
      do_gather(tag[OPP_DIR(dir)]);
    } else {
      do_gather(tag[OPP_DIR(dir)]);
    }
  }

#ifndef NO_LONG_LINKS
  /* and 3rd neighbours */
  for( dir=X3UP; dir <= T3UP; dir++){
    if (start==1){
      tag[OPP_3_DIR(dir)] =
        declare_strided_gather( (char *)(templongvec+INDEX_3RD(dir)),
                                4*sizeof(su3_vector), sizeof(su3_vector),
                                OPP_3_DIR(dir), parity,
                                gen_pt[OPP_3_DIR(dir)] );
      prepare_gather(tag[OPP_3_DIR(dir)]);
      do_gather(tag[OPP_3_DIR(dir)]);
    } else {
      do_gather(tag[OPP_3_DIR(dir)]);
    }
  }
#endif

  /* Wait gathers from positive directions, multiply by matrix and
     accumulate */
  for(dir=XUP; dir<=TUP; dir++){
    wait_gather(tag[dir]);
  }

  /* wait for the 3-neighbours from positive directions, multiply */
  for(dir=X3UP; dir<=T3UP; dir++){
    wait_gather(tag[dir]);
  }

  FORSOMEPARITYDOMAIN_OMP(i,s,parity, private(fat4,long4) ){
    if( i < loopend-FETCH_UP ){
      fat4 = &(t_fatlink[4*(i+FETCH_UP)]);
      prefetch_4MVVVV(
                      fat4,
                      (su3_vector *)gen_pt[XUP][i+FETCH_UP],
                      (su3_vector *)gen_pt[YUP][i+FETCH_UP],
                      (su3_vector *)gen_pt[ZUP][i+FETCH_UP],
                      (su3_vector *)gen_pt[TUP][i+FETCH_UP] );
#ifndef NO_LONG_LINKS
      long4 = &(t_longlink[4*(i+FETCH_UP)]);
      prefetch_VV(
                  (su3_vector *)F_PT(s+FETCH_UP,dest),
                  templongv1+i+FETCH_UP);
      prefetch_4MVVVV(
                      long4,
                      (su3_vector *)gen_pt[X3UP][i+FETCH_UP],
                      (su3_vector *)gen_pt[Y3UP][i+FETCH_UP],
                      (su3_vector *)gen_pt[Z3UP][i+FETCH_UP],
                      (su3_vector *)gen_pt[T3UP][i+FETCH_UP] );
#endif
    }

    fat4 = &(t_fatlink[4*i]);
    mult_su3_mat_vec_sum_4dir( fat4,
                               (su3_vector *)gen_pt[XUP][i],
                               (su3_vector *)gen_pt[YUP][i],
                               (su3_vector *)gen_pt[ZUP][i],
                               (su3_vector *)gen_pt[TUP][i],
                               (su3_vector *)F_PT(s,dest));

#ifndef NO_LONG_LINKS
    long4 = &(t_longlink[4*i]);
    mult_su3_mat_vec_sum_4dir( long4,
                               (su3_vector *)gen_pt[X3UP][i],
                               (su3_vector *)gen_pt[Y3UP][i],
                               (su3_vector *)gen_pt[Z3UP][i],
                               (su3_vector *)gen_pt[T3UP][i],
                               templongv1+i);
#endif
  } END_LOOP_OMP

  /* Wait gathers from negative directions, accumulate (negative) */
  for(dir=XUP; dir<=TUP; dir++){
    wait_gather(tag[OPP_DIR(dir)]);
  }

  /* and the same for the negative 3-rd neighbours */
  for(dir=X3UP; dir<=T3UP; dir++){
    wait_gather(tag[OPP_3_DIR(dir)]);
  }

  FORSOMEPARITYDOMAIN_OMP(i,s,parity, ){
    if( i < loopend-FETCH_UP ){
#ifndef NO_LONG_LINKS
      prefetch_VV(
                  (su3_vector *)F_PT(s+FETCH_UP,dest),
                  templongv1+i+FETCH_UP);
#endif
      prefetch_VVVV(
                    (su3_vector *)gen_pt[XDOWN][i+FETCH_UP],
                    (su3_vector *)gen_pt[YDOWN][i+FETCH_UP],
                    (su3_vector *)gen_pt[ZDOWN][i+FETCH_UP],
                    (su3_vector *)gen_pt[TDOWN][i+FETCH_UP] );
      prefetch_VVVV(
                    (su3_vector *)gen_pt[X3DOWN][i+FETCH_UP],
                    (su3_vector *)gen_pt[Y3DOWN][i+FETCH_UP],
                    (su3_vector *)gen_pt[Z3DOWN][i+FETCH_UP],
                    (su3_vector *)gen_pt[T3DOWN][i+FETCH_UP] );
    }

    sub_four_su3_vecs( (su3_vector *)F_PT(s,dest),
                       (su3_vector *)(gen_pt[XDOWN][i]),
                       (su3_vector *)(gen_pt[YDOWN][i]),
                       (su3_vector *)(gen_pt[ZDOWN][i]),
                       (su3_vector *)(gen_pt[TDOWN][i]) );
#ifndef NO_LONG_LINKS
    sub_four_su3_vecs( templongv1+i,
                       (su3_vector *)(gen_pt[X3DOWN][i]),
                       (su3_vector *)(gen_pt[Y3DOWN][i]),
                       (su3_vector *)(gen_pt[Z3DOWN][i]),
                       (su3_vector *)(gen_pt[T3DOWN][i]) );
    /*** Now need to add these things together ***/
    add_su3_vector((su3_vector *)F_PT(s,dest), templongv1+i,
                   (su3_vector *)F_PT(s,dest));
#endif
  } END_LOOP_OMP
}
/* Special dslash for use by congrad.  Uses restart_gather_field() when
   possible.

   src, dest  source and destination vector fields
   parity     parity of the destination sites (EVEN, ODD or EVENANDODD)
   tag        array of message tags, to be set if this is the first use
              (start==1), otherwise reused
   start      if 1, use start_gather_field, otherwise use
              restart_gather_field
   fn         link structure; the routine terminates if it is NULL

   The calling program must clean up the gathers and temps!

   The work space temp[0..8] is allocated on the first call (guarded by
   temp_not_allocated) and reused afterwards.  The routine terminates if
   any of those allocations fails.

   NOTE(review): with NO_LONG_LINKS defined the negative 3-neighbour
   gathers, the temp[8] subtraction/addition and the private(fat4,long4)
   clauses are still compiled -- confirm that configuration. */
void dslash_fn_field_special(su3_vector *src, su3_vector *dest,
                             int parity, msg_tag **tag, int start,
                             fn_links_t *fn){
  register int i;
  register site *s;
  register int dir,otherparity=0;
  register su3_matrix *fat4;
  su3_matrix *t_fatlink;
#ifndef NO_LONG_LINKS
  register su3_matrix *long4;
  su3_matrix *t_longlink;
#endif

  /* allocate temporary work space only if not already allocated */
  if(temp_not_allocated)
    {
      for( dir=XUP; dir<=TUP; dir++ ){
        temp[dir]  =(su3_vector *)malloc(sites_on_node*sizeof(su3_vector));
        temp[dir+4]=(su3_vector *)malloc(sites_on_node*sizeof(su3_vector));
        /* fail loudly instead of dereferencing a NULL buffer later */
        if(temp[dir] == NULL || temp[dir+4] == NULL){
          printf("dslash_fn_field_special: No room for temporary\n");
          terminate(1);
        }
      }
      temp[8]=(su3_vector *)malloc(sites_on_node*sizeof(su3_vector));
      if(temp[8] == NULL){
        printf("dslash_fn_field_special: No room for temporary\n");
        terminate(1);
      }
      temp_not_allocated = 0 ;
    }

  /* load fatlinks and longlinks */
  if(fn == NULL){
    printf("dslash_fn_field_special: invalid fn links!\n");
    terminate(1);
  }
#ifndef NO_LONG_LINKS
  t_longlink = get_lnglinks(fn);
#endif
  t_fatlink = get_fatlinks(fn);

  switch(parity)
    {
    case EVEN:       otherparity=ODD; break;
    case ODD:        otherparity=EVEN; break;
    case EVENANDODD: otherparity=EVENANDODD; break;
    }

  /* Start gathers from positive directions */
  /* And start the 3-step gather too */
  for( dir=XUP; dir<=TUP; dir++ ){
    if(start==1)
      {
        tag[dir] = start_gather_field( src, sizeof(su3_vector),
                                       dir, parity,gen_pt[dir] );
#ifndef NO_LONG_LINKS
        tag[DIR3(dir)] = start_gather_field(src, sizeof(su3_vector),
                                            DIR3(dir),parity,
                                            gen_pt[DIR3(dir)] );
#endif
      }
    else
      {
        restart_gather_field( src, sizeof(su3_vector),
                              dir, parity,gen_pt[dir], tag[dir]);
#ifndef NO_LONG_LINKS
        restart_gather_field(src, sizeof(su3_vector), DIR3(dir), parity,
                             gen_pt[DIR3(dir)], tag[DIR3(dir)]);
#endif
      }
  }

  /* Multiply by adjoint matrix at other sites */
  /* Use fat link for single link transport */
  /* (Prefetching is disabled in this loop; the prefetch calls had been
     commented out with a NOPRE marker.) */
  FORSOMEPARITYDOMAIN_OMP( i, s, otherparity, private(fat4,long4) ){
    fat4 = &(t_fatlink[4*i]);
    mult_adj_su3_mat_4vec( fat4, &(src[i]), &(temp[0][i]),
                           &(temp[1][i]), &(temp[2][i]), &(temp[3][i]) );
#ifndef NO_LONG_LINKS
    /* multiply by 3-link matrices too */
    long4 = &(t_longlink[4*i]);
    mult_adj_su3_mat_4vec( long4, &(src[i]),&(temp[4][i]),
                           &(temp[5][i]), &(temp[6][i]), &(temp[7][i]) );
#endif
  } END_LOOP_OMP

  /* Start gathers from negative directions */
  for( dir=XUP; dir <= TUP; dir++){
    if (start==1) tag[OPP_DIR(dir)] =
                    start_gather_field( temp[dir], sizeof(su3_vector),
                                        OPP_DIR( dir), parity,
                                        gen_pt[OPP_DIR(dir)] );
    else restart_gather_field( temp[dir], sizeof(su3_vector),
                               OPP_DIR( dir), parity,
                               gen_pt[OPP_DIR(dir)], tag[OPP_DIR(dir)] );
  }

  /* Start 3-neighbour gathers from negative directions */
  for( dir=X3UP; dir <= T3UP; dir++){
    if (start==1) tag[OPP_3_DIR(dir)]=start_gather_field(
                    temp[INDEX_3RD(dir)+4], sizeof(su3_vector),
                    OPP_3_DIR( dir), parity, gen_pt[OPP_3_DIR(dir)] );
    else restart_gather_field(temp[INDEX_3RD(dir)+4], sizeof(su3_vector),
                              OPP_3_DIR( dir),parity,
                              gen_pt[OPP_3_DIR(dir)], tag[OPP_3_DIR(dir)] );
  }

  /* Wait gathers from positive directions, multiply by matrix and
     accumulate */
  /* wait for the 3-neighbours from positive directions, multiply */
  for(dir=XUP; dir<=TUP; dir++){
    wait_gather(tag[dir]);
#ifndef NO_LONG_LINKS
    wait_gather(tag[DIR3(dir)]);
#endif
  }

  /* (Prefetching is disabled in this loop as well.) */
  FORSOMEPARITYDOMAIN_OMP(i,s,parity, private(fat4,long4) ){
    fat4 = &(t_fatlink[4*i]);
    mult_su3_mat_vec_sum_4dir( fat4,
                               (su3_vector *)gen_pt[XUP][i],
                               (su3_vector *)gen_pt[YUP][i],
                               (su3_vector *)gen_pt[ZUP][i],
                               (su3_vector *)gen_pt[TUP][i],
                               &(dest[i]) );

#ifndef NO_LONG_LINKS
    long4 = &(t_longlink[4*i]);
    mult_su3_mat_vec_sum_4dir( long4,
                               (su3_vector *)gen_pt[X3UP][i],
                               (su3_vector *)gen_pt[Y3UP][i],
                               (su3_vector *)gen_pt[Z3UP][i],
                               (su3_vector *)gen_pt[T3UP][i],
                               &(temp[8][i]));
#endif
  } END_LOOP_OMP

  /* Wait gathers from negative directions, accumulate (negative) */
  /* and the same for the negative 3-rd neighbours */
  for(dir=XUP; dir<=TUP; dir++){
    wait_gather(tag[OPP_DIR(dir)]);
  }
  for(dir=X3UP; dir<=T3UP; dir++){
    wait_gather(tag[OPP_3_DIR(dir)]);
  }

  /* (Prefetching is disabled in this loop as well.) */
  FORSOMEPARITYDOMAIN_OMP(i,s,parity, ){
    sub_four_su3_vecs( &(dest[i]),
                       (su3_vector *)(gen_pt[XDOWN][i]),
                       (su3_vector *)(gen_pt[YDOWN][i]),
                       (su3_vector *)(gen_pt[ZDOWN][i]),
                       (su3_vector *)(gen_pt[TDOWN][i]) );
    sub_four_su3_vecs( &(temp[8][i]),
                       (su3_vector *)(gen_pt[X3DOWN][i]),
                       (su3_vector *)(gen_pt[Y3DOWN][i]),
                       (su3_vector *)(gen_pt[Z3DOWN][i]),
                       (su3_vector *)(gen_pt[T3DOWN][i]) );
    /* Now need to add these things together */
    add_su3_vector(&(dest[i]), &(temp[8][i]),&(dest[i]));
  } END_LOOP_OMP
}
/* D_slash routine - sets dest. on each site equal to sum of
   sources parallel transported to site, with minus sign for transport
   from negative directions.  Use "fatlinks" for one link transport,
   "longlinks" for three link transport.

   src, dest  field offsets of the source and destination vectors
   parity     parity of the destination sites (EVEN, ODD or EVENANDODD)

   The link arrays/fields, the validity flags and the per-site temporaries
   (tempvec, templongvec, templongv1) used below are declared outside this
   function.

   NOTE(review): otherparity has no initialiser and the switch has no
   default case, so it is only set for the three parity values listed. */
void dslash_fn( field_offset src, field_offset dest, int parity ) {
   register int i;
   register site *s;
   register int dir,otherparity;
   register su3_matrix *fat4, *long4;
   msg_tag *tag[16];

    /* (Re)load the links if they are flagged as not valid */
    if(!valid_longlinks)load_longlinks();
    if(!valid_fatlinks)load_fatlinks();
    switch(parity){
	case EVEN:	otherparity=ODD; break;
	case ODD:	otherparity=EVEN; break;
	case EVENANDODD:	otherparity=EVENANDODD; break;
    }

    /* Start gathers from positive directions */
    /* And start the 3-step gather too */
    for( dir=XUP; dir<=TUP; dir++ ){
	tag[dir] = start_gather( src, sizeof(su3_vector), dir, parity,
	    gen_pt[dir] );
	tag[DIR3(dir)] = start_gather( src, sizeof(su3_vector), DIR3(dir),
	    parity, gen_pt[DIR3(dir)] );
    }

    /* Multiply by adjoint matrix at other sites */
    /* Use fat link for single link transport */
    FORSOMEPARITY( i, s, otherparity ){
      /* Prefetch the operands of the site FETCH_UP ahead */
      if( i < loopend-FETCH_UP ){
#ifdef DSLASH_TMP_LINKS
	fat4 = &(t_fatlink[4*(i+FETCH_UP)]);
	long4 = &(t_longlink[4*(i+FETCH_UP)]);
#else
	fat4 = (s+FETCH_UP)->fatlink;
	long4 = (s+FETCH_UP)->longlink;
#endif
	prefetch_4MV4V(
		       fat4,
		       (su3_vector *)F_PT(s+FETCH_UP,src),
		       (s+FETCH_UP)->tempvec );
	prefetch_4MV4V(
		       long4,
		       (su3_vector *)F_PT(s+FETCH_UP,src),
		       (s+FETCH_UP)->templongvec );
      }

      /* Links come either from the temporary arrays or from the site
         structure, depending on DSLASH_TMP_LINKS */
#ifdef DSLASH_TMP_LINKS
      fat4 = &(t_fatlink[4*i]);
      long4 = &(t_longlink[4*i]);
#else
      fat4 = s->fatlink;
      long4 = s->longlink;
#endif
	mult_adj_su3_mat_vec_4dir( fat4,
	    (su3_vector *)F_PT(s,src), s->tempvec );
	/* multiply by 3-link matrices too */
	mult_adj_su3_mat_vec_4dir( long4,
	    (su3_vector *)F_PT(s,src), s->templongvec );
    } END_LOOP

    /* Start gathers from negative directions */
    for( dir=XUP; dir <= TUP; dir++){
	tag[OPP_DIR(dir)] = start_gather( F_OFFSET(tempvec[dir]),
	    sizeof(su3_vector), OPP_DIR( dir), parity,
	    gen_pt[OPP_DIR(dir)] );
    }

    /* Start 3-neighbour gathers from negative directions */
    for( dir=X3UP; dir <= T3UP; dir++){
	tag[OPP_3_DIR(dir)] = start_gather( F_OFFSET(templongvec[INDEX_3RD(dir)]),
			sizeof(su3_vector), OPP_3_DIR( dir), parity,
			gen_pt[OPP_3_DIR(dir)] );
    }

    /* Wait gathers from positive directions, multiply by matrix and
	accumulate */
    /* wait for the 3-neighbours from positive directions, multiply */
    for(dir=XUP; dir<=TUP; dir++){
	wait_gather(tag[dir]);
	wait_gather(tag[DIR3(dir)]);
    }
    /* Wait gathers from negative directions, accumulate (negative) */
    /* and the same for the negative 3-rd neighbours */
    for(dir=XUP; dir<=TUP; dir++){
      wait_gather(tag[OPP_DIR(dir)]);
    }
    for(dir=X3UP; dir<=T3UP; dir++){
      wait_gather(tag[OPP_3_DIR(dir)]);
    }


    FORSOMEPARITY(i,s,parity){
#ifdef DSLASH_TMP_LINKS
      fat4 = &(t_fatlink[4*i]);
      long4 = &(t_longlink[4*i]);
#else
      fat4 = s->fatlink;
      long4 = s->longlink;
#endif
      /* One-link contribution from the positive directions goes to dest */
      mult_su3_mat_vec_sum_4dir( fat4,
	    (su3_vector *)gen_pt[XUP][i], (su3_vector *)gen_pt[YUP][i],
	    (su3_vector *)gen_pt[ZUP][i], (su3_vector *)gen_pt[TUP][i],
	    (su3_vector *)F_PT(s,dest));

      /* Three-link contribution from the positive directions goes to the
         per-site temporary templongv1 */
      mult_su3_mat_vec_sum_4dir( long4,
	    (su3_vector *)gen_pt[X3UP][i], (su3_vector *)gen_pt[Y3UP][i],
	    (su3_vector *)gen_pt[Z3UP][i], (su3_vector *)gen_pt[T3UP][i],
	    (su3_vector *) &(s->templongv1));

      /* Prefetch the operands of the site FETCH_UP ahead */
      if( i < loopend-FETCH_UP ){
#ifdef DSLASH_TMP_LINKS
	fat4 = &(t_fatlink[4*(i+FETCH_UP)]);
	long4 = &(t_longlink[4*(i+FETCH_UP)]);
#else
	fat4 = (s+FETCH_UP)->fatlink;
	long4 = (s+FETCH_UP)->longlink;
#endif
	prefetch_4MVVVV(
			fat4,
			(su3_vector *)gen_pt[XUP][i+FETCH_UP],
			(su3_vector *)gen_pt[YUP][i+FETCH_UP],
			(su3_vector *)gen_pt[ZUP][i+FETCH_UP],
			(su3_vector *)gen_pt[TUP][i+FETCH_UP] );
	prefetch_4MVVVV(
			long4,
			(su3_vector *)gen_pt[X3UP][i+FETCH_UP],
			(su3_vector *)gen_pt[Y3UP][i+FETCH_UP],
			(su3_vector *)gen_pt[Z3UP][i+FETCH_UP],
			(su3_vector *)gen_pt[T3UP][i+FETCH_UP] );
	prefetch_VVVV(
		      (su3_vector *)gen_pt[XDOWN][i+FETCH_UP],
		      (su3_vector *)gen_pt[YDOWN][i+FETCH_UP],
		      (su3_vector *)gen_pt[ZDOWN][i+FETCH_UP],
		      (su3_vector *)gen_pt[TDOWN][i+FETCH_UP] );
	prefetch_VVVV(
		      (su3_vector *)gen_pt[X3DOWN][i+FETCH_UP],
		      (su3_vector *)gen_pt[Y3DOWN][i+FETCH_UP],
		      (su3_vector *)gen_pt[Z3DOWN][i+FETCH_UP],
		      (su3_vector *)gen_pt[T3DOWN][i+FETCH_UP] );
      }

      /* Subtract the gathered negative-direction vectors */
      sub_four_su3_vecs( (su3_vector *)F_PT(s,dest),
	    (su3_vector *)(gen_pt[XDOWN][i]),
	    (su3_vector *)(gen_pt[YDOWN][i]),
	    (su3_vector *)(gen_pt[ZDOWN][i]),
	    (su3_vector *)(gen_pt[TDOWN][i]) );
      sub_four_su3_vecs( &(s->templongv1),
	    (su3_vector *)(gen_pt[X3DOWN][i]),
	    (su3_vector *)(gen_pt[Y3DOWN][i]),
	    (su3_vector *)(gen_pt[Z3DOWN][i]),
	    (su3_vector *)(gen_pt[T3DOWN][i]) );
      /* Now need to add these things together */
      add_su3_vector((su3_vector *)F_PT(s,dest), & (s->templongv1),
		     (su3_vector *)F_PT(s,dest));
    } END_LOOP
    /* NOTE(review): the definition appears truncated here -- no gather
       cleanup and no closing brace are visible before the next definition
       begins. Confirm against the complete source. */
/* Smearing level 0 */

/* Helper for QOP_hisq_force_multi_smearing0_fnmat: process one direction.
 *
 * Builds the residue-weighted sum over terms of x[term] times the adjoint
 * of x[term] shifted from shift_dir into oprod, passes it through
 * link_gather_connection_qdp() for gather_dir, and adds the result (with
 * coefficient 1) to accum.
 *
 *   residues, x, nterms  inputs of the rational-function expansion
 *   shift_dir            direction handed to fnshift()/fndir()
 *   gather_dir           direction handed to link_gather_connection_qdp()
 *   accum                force accumulator that receives the result
 *   oprod, oprod_gathered, tmat, tsrc, vec_tmp
 *                        scratch fields owned by the caller
 *
 * Returns the per-site flop count using the same constants as before
 * (54 per outer product, 36 per scaled matrix accumulation).
 */
static size_t
hisq_smearing0_one_dir_fnmat(REAL *residues, QDP_ColorVector *x[], int nterms,
			     int shift_dir, int gather_dir,
			     QDP_ColorMatrix *accum,
			     QDP_ColorMatrix *oprod,
			     QDP_ColorMatrix *oprod_gathered,
			     QDP_ColorMatrix *tmat,
			     QDP_ColorVector *tsrc[2],
			     QDP_ColorVector *vec_tmp[2])
{
  size_t nflops = 0;
  REAL coeff = 1.; // fermion_eps is outside this routine in "wrapper" routine
  int term;
  int k = 0; // which vec_tmp we are using (0 or 1)

  QDP_V_eq_V(tsrc[k], x[0], QDP_all);
  QDP_V_eq_sV(vec_tmp[k], tsrc[k],
	      fnshift(shift_dir), fndir(shift_dir), QDP_all);
  QDP_M_eq_zero(oprod, QDP_all);

  for(term=0;term<nterms;term++){
    // Issue the shift for the next term into the other slot before
    // consuming the current one (presumably to overlap the shift with
    // the outer product below -- TODO confirm)
    if(term<nterms-1) {
      QDP_V_eq_V(tsrc[1-k], x[term+1], QDP_all);
      QDP_V_eq_sV(vec_tmp[1-k], tsrc[1-k],
		  fnshift(shift_dir), fndir(shift_dir), QDP_all);
    }
    QDP_M_eq_V_times_Va(tmat, tsrc[k], vec_tmp[k], QDP_all);
    nflops += 54;
    QDP_discard_V(vec_tmp[k]);
    QDP_M_peq_r_times_M(oprod, &residues[term], tmat, QDP_all);
    nflops += 36;

    k=1-k; // swap 0 and 1
  } // end loop over terms in rational function expansion

  link_gather_connection_qdp(oprod_gathered, oprod, tmat, gather_dir );
  QDP_M_peq_r_times_M(accum, &coeff, oprod_gathered, QDP_all);
  nflops += 36;

  return nflops;
}

/* Level-0 HISQ force: accumulate the one-link and Naik outer-product
 * contributions for all four directions.
 *
 *   info              info->final_flop is set to the per-site flop count
 *                     times QDP_sites_on_node
 *   residues          one coefficient per term
 *   x                 nterms color vectors
 *   nterms            number of terms; if 0 the routine returns at once
 *                     without touching info or the accumulators
 *   force_accum       zeroed, then receives the one-link contribution
 *   force_accum_naik  zeroed, then receives the Naik contribution
 *
 * Only the scratch fields that are actually used are allocated.
 */
static void
QOP_hisq_force_multi_smearing0_fnmat(QOP_info_t *info, REAL *residues,
				     QDP_ColorVector *x[], int nterms,
				     QDP_ColorMatrix *force_accum[4],
				     QDP_ColorMatrix *force_accum_naik[4])
{
  int dir;
  QDP_ColorMatrix *tmat;
  QDP_ColorMatrix *oprod_along_path[2];
  QDP_ColorVector *tsrc[2], *vec_tmp[2];
  size_t nflops = 0;

  if( nterms==0 )return;

  tmat = QDP_create_M();
  tsrc[0] = QDP_create_V();
  tsrc[1] = QDP_create_V();
  vec_tmp[0] = QDP_create_V();
  vec_tmp[1] = QDP_create_V();
  oprod_along_path[0] = QDP_create_M();
  oprod_along_path[1] = QDP_create_M();

  // *** one-link part ***

  // clear force accumulators
  for(dir=XUP;dir<=TUP;dir++)
    QDP_M_eq_zero(force_accum[dir], QDP_all);

  for(dir=XUP;dir<=TUP;dir++){ //AB loop on directions, path table is not needed
    nflops += hisq_smearing0_one_dir_fnmat(residues, x, nterms,
					   OPP_DIR(dir), dir,
					   force_accum[dir],
					   oprod_along_path[0],
					   oprod_along_path[1],
					   tmat, tsrc, vec_tmp);
  } // end of loop on directions

  // *** Naik part ***

  // clear force accumulators
  for(dir=XUP;dir<=TUP;dir++)
    QDP_M_eq_zero(force_accum_naik[dir], QDP_all);

  for(dir=XUP;dir<=TUP;dir++){ //AB loop on directions, path table is not needed
    nflops += hisq_smearing0_one_dir_fnmat(residues, x, nterms,
					   OPP_3_DIR( DIR3(dir) ), DIR3(dir),
					   force_accum_naik[dir],
					   oprod_along_path[0],
					   oprod_along_path[1],
					   tmat, tsrc, vec_tmp);
  } // end of loop on directions

  QDP_destroy_V( tsrc[0] );
  QDP_destroy_V( tsrc[1] );
  QDP_destroy_V( vec_tmp[0] );
  QDP_destroy_V( vec_tmp[1] );
  QDP_destroy_M( tmat );
  QDP_destroy_M( oprod_along_path[0] );
  QDP_destroy_M( oprod_along_path[1] );

  info->final_flop = ((double)nflops)*QDP_sites_on_node;

  return;
} //hisq_force_multi_smearing0_fnmat