// // Unsweep for a sight map // static void Unsweep(Map *map) { START(unSweepTime); PREFETCH(&__prefetch); // Last sweep position S32 tileX = map->lastX; S32 tileZ = map->lastZ; // Last sweep radius S32 r = map->lastR; // Get viewing mask for lower layer U8 *mapLo = map->GetByteMap(Map::LV_LO); U8 maskLo = map->GetBitMask(Map::LV_LO); // Dirty cells that line of sight has changed in DirtyCells(tileX - r, tileZ - r, tileX + r, tileZ + r, map->lastTeam); // iterate over all tiles within last scan radius S32 first = XZToSeemap(-r, -r); for (S32 y = -r; y <= r; y++, first += MAPSIDE) { PREFETCH(&mapLo[first+MAPSIDE]); for (S32 x = -r, index = first; x <= r; x++, index++) { // Unsweep tile on ground level if was swept if (mapLo[index] & maskLo) { // Unsweep for all teams last swept CantSee(tileX + x, tileZ + y, x, y, map->lastTeam, Map::LV_LO); // Clear seen bit mapLo[index] &= ~maskLo; } } } // Reset last radius and team mask map->lastR = 0; map->lastTeam = 0; map->lastAlt = F32_MAX; STOP(unSweepTime); }
/* Self-test of the 64-bit atomic primitives (cas64/load64/store64/xadd64/xchg64). */
static void
TestAtomic64(void)
{
	uint64 val, expect;

	val = 42;
	expect = 0;
	PREFETCH(&val);

	/* CAS with a stale expected value must fail and change nothing. */
	if(runtime_cas64(&val, expect, 1))
		runtime_throw("cas64 failed");
	if(expect != 0)
		runtime_throw("cas64 failed");

	/* CAS with the matching expected value must succeed. */
	expect = 42;
	if(!runtime_cas64(&val, expect, 1))
		runtime_throw("cas64 failed");
	if(expect != 42 || val != 1)
		runtime_throw("cas64 failed");

	if(runtime_atomicload64(&val) != 1)
		runtime_throw("load64 failed");

	/* Store/load round-trip of a value that exercises the high half. */
	runtime_atomicstore64(&val, (1ull<<40)+1);
	if(runtime_atomicload64(&val) != (1ull<<40)+1)
		runtime_throw("store64 failed");

	/* xadd64 returns the new value. */
	if(runtime_xadd64(&val, (1ull<<40)+1) != (2ull<<40)+2)
		runtime_throw("xadd64 failed");
	if(runtime_atomicload64(&val) != (2ull<<40)+2)
		runtime_throw("xadd64 failed");

	/* xchg64 returns the previous value. */
	if(runtime_xchg64(&val, (3ull<<40)+3) != (2ull<<40)+2)
		runtime_throw("xchg64 failed");
	if(runtime_atomicload64(&val) != (3ull<<40)+3)
		runtime_throw("xchg64 failed");
}
/* Forward the object referenced by p_ref, then drain the collector's private
 * trace stack, forwarding every reference pushed during the process.
 * forward_object() may push new refs and may swap collector->trace_stack,
 * so the stack pointer is re-read after every call. */
static void trace_object(Collector *collector, REF *p_ref)
{
  Vector_Block *stack;

  forward_object(collector, p_ref);

  stack = (Vector_Block *)collector->trace_stack;
  for(;;){
    if(vector_stack_is_empty(stack))
      break;
    p_ref = (REF *)vector_stack_pop(stack);
#ifdef PREFETCH_SUPPORTED
    /* DO PREFETCH: warm the cache with the next slot on the stack, if any. */
    if(mark_prefetch && !vector_stack_is_empty(stack)){
      REF *peek = (REF *)vector_stack_read(stack, 0);
      PREFETCH( read_slot(peek) );
    }
#endif
    forward_object(collector, p_ref);
    stack = (Vector_Block *)collector->trace_stack;
  }
  return;
}
/*
 * AVX force/jerk kernel for TWO i-particles against nj predicted j-particles,
 * additionally building a per-i-particle neighbour list.
 *
 * YMMxx/XMMxx are register-name macros and the V* calls are AVX instruction
 * macros (defined elsewhere in the project).  The two i-particles share the
 * 8-lane YMM registers: lanes 0-3 belong to posvel[0], lanes 4-7 to posvel[1]
 * (VMERGE combines the two broadcast halves) -- layout inferred from the
 * accjerk reduction at the end; TODO confirm against the avx macro header.
 *
 * nj      : number of j-particles in the global pred_mem array
 * posvel  : the two i-particles' predicted position/velocity (posvel[0..1])
 * accjerk : out -- acc/jerk/potential/nearest-neighbour for accjerk[0..1]
 * i       : global i-particle index; rows i and i+1 of nbl[]/nblen[] are filled
 * ithread : selects this thread's scratch neighbour-flag buffer
 */
void gravity_kernel2n(int nj, pPrdPosVel posvel, pNewAccJrk accjerk, int i, int ithread)
{
  int ret;
  int j;
  double true_rmin2;
  float hinv0, hinv1;
  pPred_Mem jptr = pred_mem;
  pIparticle iptr;
  pNeighbourList nbptr, nbptr0 = neighbour[ithread];
  float ten = 10.0, minusone = -1.0;

  /* -2/h per i-particle; h2 == 0 means "no neighbour sphere", so use a huge
   * negative sentinel that no -2/r value can be below. */
  if(posvel[0].h2 == 0.0)
    hinv0 = - 1e10;
  else
    hinv0 = - 2.0 / sqrt(posvel[0].h2);
  if(posvel[1].h2 == 0.0)
    hinv1 = - 1e10;
  else
    hinv1 = - 2.0 / sqrt(posvel[1].h2);

  /* 32-byte aligned scratch block for the i-particle working arrays. */
  ret = posix_memalign((void **)&iptr, 32, NVAR_IP * 32);
  assert(ret == 0);

  /* Broadcast both i-particles into registers, then spill to the aligned
   * scratch area so the loop can use memory-operand forms. */
  VBROADCASTSD(posvel[0].xpos, YMM00);
  VBROADCASTSD(posvel[0].ypos, YMM01);
  VBROADCASTSD(posvel[0].zpos, YMM02);
  VBROADCASTSD(posvel[1].xpos, YMM03);
  VBROADCASTSD(posvel[1].ypos, YMM04);
  VBROADCASTSD(posvel[1].zpos, YMM05);
  VBROADCASTSS(posvel[0].xvel, XMM06);
  VBROADCASTSS(posvel[1].xvel, XMM07);
  VMERGE(YMM06, YMM07, YMM06);
  VBROADCASTSS(posvel[0].yvel, XMM08);
  VBROADCASTSS(posvel[1].yvel, XMM09);
  VMERGE(YMM08, YMM09, YMM07);
  VBROADCASTSS(posvel[0].zvel, XMM10);
  VBROADCASTSS(posvel[1].zvel, XMM11);
  VMERGE(YMM10, YMM11, YMM08);
  VBROADCASTSS(posvel[0].id, XMM12);
  VBROADCASTSS(posvel[1].id, XMM13);
  VMERGE(YMM12, YMM13, YMM09);
  VBROADCASTSS(posvel[0].eps2, XMM14);
  VBROADCASTSS(posvel[1].eps2, XMM15);
  VMERGE(YMM14, YMM15, YMM10);
  VBROADCASTSS(hinv0, XMM11);
  VBROADCASTSS(hinv1, XMM12);
  VMERGE(YMM11, YMM12, YMM11);
  VBROADCASTSS(ten, YMM12);      /* rmin2 start value (10.0 in -2/r units) */
  VBROADCASTSS(minusone, YMM13); /* nearest-neighbour index start (-1) */
  VSTORPD(YMM00, iptr->xpos0[0]);
  VSTORPD(YMM01, iptr->ypos0[0]);
  VSTORPD(YMM02, iptr->zpos0[0]);
  VSTORPD(YMM03, iptr->xpos1[0]);
  VSTORPD(YMM04, iptr->ypos1[0]);
  VSTORPD(YMM05, iptr->zpos1[0]);
  VSTORPS(YMM06, iptr->xvel01[0]);
  VSTORPS(YMM07, iptr->yvel01[0]);
  VSTORPS(YMM08, iptr->zvel01[0]);
  VSTORPS(YMM09, iptr->id01[0]);
  VSTORPS(YMM10, iptr->veps2[0]);
  VSTORPS(YMM11, iptr->hinv[0]);
  VSTORPS(YMM12, iptr->rmin2[0]);
  VSTORPS(YMM13, iptr->in[0]);
  VZEROALL; /* clear accumulators YMM09..YMM15 */

  for(j = 0, nbptr = nbptr0; j < nj; j += JPARA, jptr++, nbptr++){
    // if nj % 2 != 0 ATARU
    // dx -> YMM03 (double-precision position difference, rounded to float)
    VLOADPD(jptr->xpos[0], YMM00);
    VSUBPD_M(iptr->xpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->xpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM03);
    // dy -> YMM04
    VLOADPD(jptr->ypos[0], YMM00);
    VSUBPD_M(iptr->ypos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->ypos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM04);
    // dz -> YMM05
    VLOADPD(jptr->zpos[0], YMM00);
    VSUBPD_M(iptr->zpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->zpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM05);
    // dr^2 (softened: starts from eps2)
    VLOADPS(iptr->veps2[0], YMM01);
    VFMADDPS(YMM01, YMM03, YMM03);
    VFMADDPS(YMM01, YMM04, YMM04);
    VFMADDPS(YMM01, YMM05, YMM05);
    // - 2 / r -> YMM01 (rsqrt estimate + one Newton step)
    VRSQRTPS(YMM01, YMM02);
    VMULPS(YMM02, YMM01, YMM01);
    VFMSUB213PS_M(three[0], YMM02, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    // mask (zero out the self-interaction lane by comparing particle ids)
    VLOADPS(jptr->indx[0], YMM02);
    VLOADPS(iptr->id01[0], YMM00);
    VCMPNEQPS(YMM00, YMM02, YMM02);
    VANDPS(YMM02, YMM01, YMM01);
    // nearest neighbour (free: YMM00, YMM02, YMM06, YMM07, YMM08)
    // rmin2 tracks the minimum of -2/r; the matching j-index is kept in 'in'.
    VLOADPS(iptr->rmin2[0], YMM00);
    VMINPS(YMM01, YMM00, YMM02);
    VSTORPS(YMM02, iptr->rmin2[0]);
    VCMPPS(YMM01, YMM00, YMM02, GT);
    VLOADPS(jptr->indx[0], YMM06);
    VANDPS(YMM02, YMM06, YMM07);
    VCMPPS(YMM01, YMM00, YMM08, LE);
    VANDPS_M(iptr->in[0], YMM08, YMM08);
    VADDPS(YMM08, YMM07, YMM07);
    VSTORPS(YMM07, iptr->in[0]);
    // neighbour list: flag lanes where -2/r <= -2/h (i.e. r <= h)
    VLOADPS(iptr->hinv[0], YMM00);
    VCMPPS(YMM00, YMM01, YMM00, LE);
    VLOADPS(flag[0], YMM02);
    VANDPS(YMM02, YMM00, YMM00);
    VSTORPS(YMM00, nbptr->flag[0]);
    // potential: accumulate -m/r in double precision (YMM09)
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VCVTPS2PD(XMM02, YMM00);
    VUP2LOW(YMM02, XMM06);
    VCVTPS2PD(XMM06, YMM06);
    VHADDPD(YMM06, YMM00, YMM07);
    VADDPD(YMM07, YMM09, YMM09);
    // dvx, dvy, dvz (vj - vi)
    VLOADPS(jptr->xvel[0], YMM06);
    VSUBPS_M(iptr->xvel01[0], YMM06, YMM06);
    VLOADPS(jptr->yvel[0], YMM07);
    VSUBPS_M(iptr->yvel01[0], YMM07, YMM07);
    VLOADPS(jptr->zvel[0], YMM08);
    VSUBPS_M(iptr->zvel01[0], YMM08, YMM08);
    // xv -> YMM00 (dr . dv)
    VMULPS(YMM03, YMM06, YMM00);
    VFMADDPS(YMM00, YMM04, YMM07);
    VFMADDPS(YMM00, YMM05, YMM08);
    // YMM00: 3.0 * xv / r^2, YMM02: - m / r^3
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VMULPS(YMM01, YMM01, YMM01);
    VMULPS(YMM01, YMM00, YMM00);
    VMULPS(YMM01, YMM02, YMM02);
    VMULPS_M(threefourth[0], YMM00, YMM00);
    // prefetch next j-particle block
    PREFETCH((jptr+1)->xpos[0]);
    PREFETCH((jptr+1)->zpos[0]);
    PREFETCH((jptr+1)->mass[0]);
    PREFETCH((jptr+1)->yvel[0]);
    // jx1, jy1, jz1 (jerk, velocity-difference part)
    VFMADDPS(YMM13, YMM02, YMM06);
    VFMADDPS(YMM14, YMM02, YMM07);
    VFMADDPS(YMM15, YMM02, YMM08);
    // ax (accumulate in double precision, YMM10)
    VMULPS(YMM02, YMM03, YMM03);
    VCVTPS2PD(XMM03, YMM06);
    VUP2LOW(YMM03, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM10, YMM10);
    // ay (YMM11)
    VMULPS(YMM02, YMM04, YMM04);
    VCVTPS2PD(XMM04, YMM06);
    VUP2LOW(YMM04, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM11, YMM11);
    // az (YMM12)
    VMULPS(YMM02, YMM05, YMM05);
    VCVTPS2PD(XMM05, YMM06);
    VUP2LOW(YMM05, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM12, YMM12);
    // jx2, jy2, jz2 (jerk, position-difference part, subtracted)
    VFNMADDPS(YMM13, YMM00, YMM03);
    VFNMADDPS(YMM14, YMM00, YMM04);
    VFNMADDPS(YMM15, YMM00, YMM05);
  }

  /* Spill accumulators and reduce: lanes {0,2}/{1,3} (double) and
   * lanes 0-3/4-7 (float jerk) belong to i-particle 0/1 respectively. */
  VSTORPD(YMM09, iptr->pot[0]);
  VSTORPD(YMM10, iptr->xacc[0]);
  VSTORPD(YMM11, iptr->yacc[0]);
  VSTORPD(YMM12, iptr->zacc[0]);
  VSTORPS(YMM13, iptr->xjrk[0]);
  VSTORPS(YMM14, iptr->yjrk[0]);
  VSTORPS(YMM15, iptr->zjrk[0]);
  accjerk[0].xacc = iptr->xacc[0] + iptr->xacc[2];
  accjerk[0].yacc = iptr->yacc[0] + iptr->yacc[2];
  accjerk[0].zacc = iptr->zacc[0] + iptr->zacc[2];
  accjerk[0].pot  = iptr->pot[0] + iptr->pot[2];
  accjerk[0].xjrk = iptr->xjrk[0] + iptr->xjrk[1] + iptr->xjrk[2] + iptr->xjrk[3];
  accjerk[0].yjrk = iptr->yjrk[0] + iptr->yjrk[1] + iptr->yjrk[2] + iptr->yjrk[3];
  accjerk[0].zjrk = iptr->zjrk[0] + iptr->zjrk[1] + iptr->zjrk[2] + iptr->zjrk[3];
  /* Nearest neighbour of i-particle 0: scan its JPARA lanes. */
  for(true_rmin2 = 1e30, j = 0; j < JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2 = iptr->rmin2[j];
      accjerk[0].rnnb = - 2.0 / true_rmin2;
      accjerk[0].nnb = (int)iptr->in[j];
    }
  }
  accjerk[1].xacc = iptr->xacc[1] + iptr->xacc[3];
  accjerk[1].yacc = iptr->yacc[1] + iptr->yacc[3];
  accjerk[1].zacc = iptr->zacc[1] + iptr->zacc[3];
  accjerk[1].pot  = iptr->pot[1] + iptr->pot[3];
  accjerk[1].xjrk = iptr->xjrk[4] + iptr->xjrk[5] + iptr->xjrk[6] + iptr->xjrk[7];
  accjerk[1].yjrk = iptr->yjrk[4] + iptr->yjrk[5] + iptr->yjrk[6] + iptr->yjrk[7];
  accjerk[1].zjrk = iptr->zjrk[4] + iptr->zjrk[5] + iptr->zjrk[6] + iptr->zjrk[7];
  /* Nearest neighbour of i-particle 1: its lanes are 4 .. 4+JPARA-1. */
  for(true_rmin2 = 1e30, j = 4; j < 4 + JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2 = iptr->rmin2[j];
      accjerk[1].rnnb = - 2.0 / true_rmin2;
      accjerk[1].nnb = (int)iptr->in[j];
    }
  }

  /* Second pass over the flag buffer: gather the neighbour indices for
   * rows i (lanes 0..JPARA-1) and i+1 (lanes 4..JPARA+3). */
  int jj;
  int nn0, nn1;
  for(nn0 = nn1 = 0, j = 0, jptr = pred_mem, nbptr = nbptr0; j < nj; j += JPARA, jptr++, nbptr++){
    for(jj = 0; jj < JPARA; jj++)
      if(nbptr->flag[jj] == 1.0){
        nbl[i][nn0] = (int)jptr->indx[jj];
        ++nn0;
      }
    for(jj = 4; jj < JPARA + 4; jj++)
      if(nbptr->flag[jj] == 1.0){
        nbl[i+1][nn1] = (int)jptr->indx[jj];
        ++nn1;
      }
  }
  /* NOTE(review): overflow is flagged only after nbl[] has been written --
   * assumes nbl rows have headroom beyond MAXLEN; confirm the declaration. */
  if(nn0 > MAXLEN || nn1 > MAXLEN)
    nblerror = 1;
  nblen[i] = nn0;
  nblen[i+1] = nn1;

  free(iptr);
  return;
}
/*
 * AVX force/jerk kernel for TWO i-particles against nj predicted j-particles.
 * Same structure as gravity_kernel2n but WITHOUT nearest-neighbour tracking
 * and neighbour-list output.  Lanes 0-3 of each 8-lane register belong to
 * posvel[0], lanes 4-7 to posvel[1] (layout inferred from the reduction at
 * the end; TODO confirm against the avx macro header).
 *
 * nj      : number of j-particles in the global pred_mem array
 * posvel  : the two i-particles' predicted position/velocity (posvel[0..1])
 * accjerk : out -- acceleration/jerk/potential for accjerk[0..1]
 */
void gravity_kernel(int nj, pPrdPosVel posvel, pNewAccJrk accjerk)
{
  int ret;
  int j;
  pPred_Mem jptr = pred_mem;
  pIparticle iptr;

  /* 32-byte aligned scratch block for the i-particle working arrays. */
  ret = posix_memalign((void **)&iptr, 32, NVAR_IP * 32);
  assert(ret == 0);

  /* Broadcast both i-particles into registers and spill to scratch memory. */
  VBROADCASTSD(posvel[0].xpos, YMM00);
  VBROADCASTSD(posvel[0].ypos, YMM01);
  VBROADCASTSD(posvel[0].zpos, YMM02);
  VBROADCASTSD(posvel[1].xpos, YMM03);
  VBROADCASTSD(posvel[1].ypos, YMM04);
  VBROADCASTSD(posvel[1].zpos, YMM05);
  VBROADCASTSS(posvel[0].xvel, XMM06);
  VBROADCASTSS(posvel[1].xvel, XMM07);
  VMERGE(YMM06, YMM07, YMM06);
  VBROADCASTSS(posvel[0].yvel, XMM08);
  VBROADCASTSS(posvel[1].yvel, XMM09);
  VMERGE(YMM08, YMM09, YMM07);
  VBROADCASTSS(posvel[0].zvel, XMM10);
  VBROADCASTSS(posvel[1].zvel, XMM11);
  VMERGE(YMM10, YMM11, YMM08);
  VBROADCASTSS(posvel[0].id, XMM12);
  VBROADCASTSS(posvel[1].id, XMM13);
  VMERGE(YMM12, YMM13, YMM09);
  VBROADCASTSS(posvel[0].eps2, XMM14);
  VBROADCASTSS(posvel[1].eps2, XMM15);
  VMERGE(YMM14, YMM15, YMM10);
  VSTORPD(YMM00, iptr->xpos0[0]);
  VSTORPD(YMM01, iptr->ypos0[0]);
  VSTORPD(YMM02, iptr->zpos0[0]);
  VSTORPD(YMM03, iptr->xpos1[0]);
  VSTORPD(YMM04, iptr->ypos1[0]);
  VSTORPD(YMM05, iptr->zpos1[0]);
  VSTORPS(YMM06, iptr->xvel01[0]);
  VSTORPS(YMM07, iptr->yvel01[0]);
  VSTORPS(YMM08, iptr->zvel01[0]);
  VSTORPS(YMM09, iptr->id01[0]);
  VSTORPS(YMM10, iptr->veps2[0]);
  VZEROALL; /* clear accumulators YMM09..YMM15 */

  for(j = 0; j < nj; j += JPARA, jptr++){
    // if nj % 2 != 0 ATARU
    // dx -> YMM03 (double-precision position difference, rounded to float)
    VLOADPD(jptr->xpos[0], YMM00);
    VSUBPD_M(iptr->xpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->xpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM03);
    // dy -> YMM04
    VLOADPD(jptr->ypos[0], YMM00);
    VSUBPD_M(iptr->ypos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->ypos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM04);
    // dz -> YMM05
    VLOADPD(jptr->zpos[0], YMM00);
    VSUBPD_M(iptr->zpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->zpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM05);
    // dr^2 (softened: starts from eps2)
    VLOADPS(iptr->veps2[0], YMM01);
    VFMADDPS(YMM01, YMM03, YMM03);
    VFMADDPS(YMM01, YMM04, YMM04);
    VFMADDPS(YMM01, YMM05, YMM05);
    // - 2 / r -> YMM01 (rsqrt estimate + one Newton step)
    VRSQRTPS(YMM01, YMM02);
    VMULPS(YMM02, YMM01, YMM01);
    VFMSUB213PS_M(three[0], YMM02, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    // mask (zero out the self-interaction lane by comparing particle ids)
    VLOADPS(jptr->indx[0], YMM02);
    VLOADPS(iptr->id01[0], YMM00);
    VCMPNEQPS(YMM00, YMM02, YMM02);
    VANDPS(YMM02, YMM01, YMM01);
    // potential: accumulate -m/r in double precision (YMM09)
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VCVTPS2PD(XMM02, YMM00);
    VUP2LOW(YMM02, XMM06);
    VCVTPS2PD(XMM06, YMM06);
    VHADDPD(YMM06, YMM00, YMM07);
    VADDPD(YMM07, YMM09, YMM09);
    // dvx, dvy, dvz (vj - vi)
    VLOADPS(jptr->xvel[0], YMM06);
    VSUBPS_M(iptr->xvel01[0], YMM06, YMM06);
    VLOADPS(jptr->yvel[0], YMM07);
    VSUBPS_M(iptr->yvel01[0], YMM07, YMM07);
    VLOADPS(jptr->zvel[0], YMM08);
    VSUBPS_M(iptr->zvel01[0], YMM08, YMM08);
    // xv -> YMM00 (dr . dv)
    VMULPS(YMM03, YMM06, YMM00);
    VFMADDPS(YMM00, YMM04, YMM07);
    VFMADDPS(YMM00, YMM05, YMM08);
    // YMM00: 3.0 * xv / r^2, YMM02: - m / r^3
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VMULPS(YMM01, YMM01, YMM01);
    VMULPS(YMM01, YMM00, YMM00);
    VMULPS(YMM01, YMM02, YMM02);
    VMULPS_M(threefourth[0], YMM00, YMM00);
    // prefetch next j-particle block
    PREFETCH((jptr+1)->xpos[0]);
    PREFETCH((jptr+1)->zpos[0]);
    PREFETCH((jptr+1)->mass[0]);
    PREFETCH((jptr+1)->yvel[0]);
    // jx1, jy1, jz1 (jerk, velocity-difference part)
    VFMADDPS(YMM13, YMM02, YMM06);
    VFMADDPS(YMM14, YMM02, YMM07);
    VFMADDPS(YMM15, YMM02, YMM08);
    // ax (accumulate in double precision, YMM10)
    VMULPS(YMM02, YMM03, YMM03);
    VCVTPS2PD(XMM03, YMM06);
    VUP2LOW(YMM03, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM10, YMM10);
    // ay (YMM11)
    VMULPS(YMM02, YMM04, YMM04);
    VCVTPS2PD(XMM04, YMM06);
    VUP2LOW(YMM04, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM11, YMM11);
    // az (YMM12)
    VMULPS(YMM02, YMM05, YMM05);
    VCVTPS2PD(XMM05, YMM06);
    VUP2LOW(YMM05, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM12, YMM12);
    // jx2, jy2, jz2 (jerk, position-difference part, subtracted)
    VFNMADDPS(YMM13, YMM00, YMM03);
    VFNMADDPS(YMM14, YMM00, YMM04);
    VFNMADDPS(YMM15, YMM00, YMM05);
  }

  /* Spill accumulators and reduce: double lanes {0,2}/{1,3} and float jerk
   * lanes 0-3/4-7 belong to i-particle 0/1 respectively. */
  VSTORPD(YMM09, iptr->pot[0]);
  VSTORPD(YMM10, iptr->xacc[0]);
  VSTORPD(YMM11, iptr->yacc[0]);
  VSTORPD(YMM12, iptr->zacc[0]);
  VSTORPS(YMM13, iptr->xjrk[0]);
  VSTORPS(YMM14, iptr->yjrk[0]);
  VSTORPS(YMM15, iptr->zjrk[0]);
  VZEROUPPER;
  accjerk[0].xacc = iptr->xacc[0] + iptr->xacc[2];
  accjerk[0].yacc = iptr->yacc[0] + iptr->yacc[2];
  accjerk[0].zacc = iptr->zacc[0] + iptr->zacc[2];
  accjerk[0].pot  = iptr->pot[0] + iptr->pot[2];
  accjerk[0].xjrk = iptr->xjrk[0] + iptr->xjrk[1] + iptr->xjrk[2] + iptr->xjrk[3];
  accjerk[0].yjrk = iptr->yjrk[0] + iptr->yjrk[1] + iptr->yjrk[2] + iptr->yjrk[3];
  accjerk[0].zjrk = iptr->zjrk[0] + iptr->zjrk[1] + iptr->zjrk[2] + iptr->zjrk[3];
  accjerk[1].xacc = iptr->xacc[1] + iptr->xacc[3];
  accjerk[1].yacc = iptr->yacc[1] + iptr->yacc[3];
  accjerk[1].zacc = iptr->zacc[1] + iptr->zacc[3];
  accjerk[1].pot  = iptr->pot[1] + iptr->pot[3];
  accjerk[1].xjrk = iptr->xjrk[4] + iptr->xjrk[5] + iptr->xjrk[6] + iptr->xjrk[7];
  accjerk[1].yjrk = iptr->yjrk[4] + iptr->yjrk[5] + iptr->yjrk[6] + iptr->yjrk[7];
  accjerk[1].zjrk = iptr->zjrk[4] + iptr->zjrk[5] + iptr->zjrk[6] + iptr->zjrk[7];

  free(iptr);
  return;
}
runtime·mallocgc ( uintptr size , uintptr typ , uint32 flag ) { int32 sizeclass; uintptr tinysize , size1; intgo rate; MCache *c; MSpan *s; MLink *v , *next; byte *tiny; #line 49 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" if ( size == 0 ) { #line 53 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" return &runtime·zerobase; } if ( m->mallocing ) runtime·throw ( "malloc/free - deadlock" ) ; #line 59 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" m->locks++; m->mallocing = 1; #line 62 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" if ( DebugTypeAtBlockEnd ) size += sizeof ( uintptr ) ; #line 65 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" c = m->mcache; if ( !runtime·debug.efence && size <= MaxSmallSize ) { if ( ( flag& ( FlagNoScan|FlagNoGC ) ) == FlagNoScan && size < TinySize ) { #line 98 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" tinysize = c->tinysize; if ( size <= tinysize ) { tiny = c->tiny; #line 102 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" if ( ( size&7 ) == 0 ) tiny = ( byte* ) ROUND ( ( uintptr ) tiny , 8 ) ; else if ( ( size&3 ) == 0 ) tiny = ( byte* ) ROUND ( ( uintptr ) tiny , 4 ) ; else if ( ( size&1 ) == 0 ) tiny = ( byte* ) ROUND ( ( uintptr ) tiny , 2 ) ; size1 = size + ( tiny - c->tiny ) ; if ( size1 <= tinysize ) { #line 111 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" v = ( MLink* ) tiny; c->tiny += size1; c->tinysize -= size1; m->mallocing = 0; m->locks--; if ( m->locks == 0 && g->preempt ) g->stackguard0 = StackPreempt; return v; } } #line 122 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" s = c->alloc[TinySizeClass]; if ( s->freelist == nil ) s = runtime·MCache_Refill ( c , TinySizeClass ) ; v = s->freelist; next = v->next; s->freelist = next; s->ref++; if ( next != nil ) PREFETCH ( next ) ; ( ( uint64* ) v ) [0] = 0; ( ( uint64* ) v ) [1] = 0; #line 135 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" if ( 
TinySize-size > tinysize ) { c->tiny = ( byte* ) v + size; c->tinysize = TinySize - size; } size = TinySize; goto done; } #line 144 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" if ( size <= 1024-8 ) sizeclass = runtime·size_to_class8[ ( size+7 ) >>3]; else sizeclass = runtime·size_to_class128[ ( size-1024+127 ) >> 7]; size = runtime·class_to_size[sizeclass]; s = c->alloc[sizeclass]; if ( s->freelist == nil ) s = runtime·MCache_Refill ( c , sizeclass ) ; v = s->freelist; next = v->next; s->freelist = next; s->ref++; if ( next != nil ) PREFETCH ( next ) ; if ( ! ( flag & FlagNoZero ) ) { v->next = nil; #line 161 "/home/14/ren/source/golang/go/src/pkg/runtime/malloc.goc" if ( size > 2*sizeof ( uintptr ) && ( ( uintptr* ) v ) [1] != 0 ) runtime·memclr ( ( byte* ) v , size ) ; } done: c->local_cachealloc += size; } else {
// Returns hits with _AT_MOST_ numMismatches mistakes. bool positionDB::getUpToNMismatches(uint64 mer, uint32 numMismatches, uint64*& posn, uint64& posnMax, uint64& posnLen) { PREFETCH(_hashedErrors); // Slightly better. posnLen = 0; if (_hashedErrors == 0L) { fprintf(stderr, "ERROR: Nobody initialized getUpToNMismatches() by calling setUpMismatchMatcher().\n"); exit(1); } if (posnMax == 0) { posnMax = 16384; try { posn = new uint64 [posnMax]; } catch (...) { fprintf(stderr, "positionDB::getUpToNMismatches()-- Can't allocate space for initial positions, requested "uint64FMT" uint64's.\n", posnMax); abort(); } } uint64 orig = HASH(mer); // Optimization that didn't work. The idea was to compute all the // hashes with errors, then sort to gain better cache locality in // the lookups. The sort dominated. // // Another: Surprisingly, theq two getDecodedValue calls are faster // than a single getDecodedValues. for (uint32 e=0; e<_hashedErrorsLen; e++) { uint64 hash = orig ^ _hashedErrors[e]; uint64 st, ed; if (_hashTable_BP) { st = getDecodedValue(_hashTable_BP, hash * _hashWidth, _hashWidth); ed = getDecodedValue(_hashTable_BP, hash * _hashWidth + _hashWidth, _hashWidth); } else { st = _hashTable_FW[hash]; ed = _hashTable_FW[hash+1]; } assert((_hashedErrors[e] & ~_hashMask) == 0); assert((hash & ~_hashMask) == 0); // Rebuild the mer from the hash and its check code. // // Compare the rebuilt mer and the original mer -- if there are // exactly N errors, it's a hit! (if there are fewer than N, // we'll find it when we look for N-1 errors). // // Before rebuilding, compute diffs on the chckBits only -- if // things are wildly different (the usual case) we'll get // enough difference here to abort. Remember, the chck bits // are not encoded, they're an exact copy from the unhashed // mer. 
if (st != ed) { for (uint64 i=ed-st, J=st * _wFin; i--; J += _wFin) { uint64 chck = getDecodedValue(_buckets, J, _chckWidth); uint64 diffs = chck ^ (mer & _mask2); uint64 d1 = diffs & uint64NUMBER(0x5555555555555555); uint64 d2 = diffs & uint64NUMBER(0xaaaaaaaaaaaaaaaa); uint64 err = countNumberOfSetBits64(d1 | (d2 >> 1)); if (err <= numMismatches) { diffs = REBUILD(hash, chck) ^ mer; d1 = diffs & uint64NUMBER(0x5555555555555555); d2 = diffs & uint64NUMBER(0xaaaaaaaaaaaaaaaa); err = countNumberOfSetBits64(d1 | (d2 >> 1)); if (err <= numMismatches) // err is junk, just need a parameter here loadPositions(J, posn, posnMax, posnLen, err); } } } }
/*
 * AVX force/jerk kernel for TWO i-particles against nj predicted j-particles,
 * with nearest-neighbour tracking (rmin2/in) but NO neighbour-list output.
 * Unlike gravity_kernel/gravity_kernel2n this variant uses plain mul/add
 * sequences instead of FMA macros -- presumably the non-FMA fallback build;
 * TODO confirm.  Lanes 0-3 of each 8-lane register belong to posvel[0],
 * lanes 4-7 to posvel[1].
 *
 * nj      : number of j-particles in the global pred_mem array
 * posvel  : the two i-particles' predicted position/velocity (posvel[0..1])
 * accjerk : out -- acc/jerk/potential/nearest-neighbour for accjerk[0..1]
 */
void gravity_kernel2(int nj, pPrdPosVel posvel, pNewAccJrk accjerk)
{
  int ret;
  int j;
  double true_rmin2;
  pPred_Mem jptr = pred_mem;
  pIparticle iptr;
  float ten = 10.0, minusone = -1.0;

  /* 32-byte aligned scratch block for the i-particle working arrays. */
  ret = posix_memalign((void **)&iptr, 32, NVAR_IP * 32);
  assert(ret == 0);

  /* Broadcast both i-particles into registers and spill to scratch memory. */
  VBROADCASTSD(posvel[0].xpos, YMM00);
  VBROADCASTSD(posvel[0].ypos, YMM01);
  VBROADCASTSD(posvel[0].zpos, YMM02);
  VBROADCASTSD(posvel[1].xpos, YMM03);
  VBROADCASTSD(posvel[1].ypos, YMM04);
  VBROADCASTSD(posvel[1].zpos, YMM05);
  VBROADCASTSS(posvel[0].xvel, XMM06);
  VBROADCASTSS(posvel[1].xvel, XMM07);
  VMERGE(YMM06, YMM07, YMM06);
  VBROADCASTSS(posvel[0].yvel, XMM08);
  VBROADCASTSS(posvel[1].yvel, XMM09);
  VMERGE(YMM08, YMM09, YMM07);
  VBROADCASTSS(posvel[0].zvel, XMM10);
  VBROADCASTSS(posvel[1].zvel, XMM11);
  VMERGE(YMM10, YMM11, YMM08);
  VBROADCASTSS(posvel[0].id, XMM12);
  VBROADCASTSS(posvel[1].id, XMM13);
  VMERGE(YMM12, YMM13, YMM09);
  VBROADCASTSS(posvel[0].eps2, XMM14);
  VBROADCASTSS(posvel[1].eps2, XMM15);
  VMERGE(YMM14, YMM15, YMM10);
  VBROADCASTSS(ten, YMM11);      /* rmin2 start value (10.0 in -2/r units) */
  VBROADCASTSS(minusone, YMM12); /* nearest-neighbour index start (-1) */
  VSTORPD(YMM00, iptr->xpos0[0]);
  VSTORPD(YMM01, iptr->ypos0[0]);
  VSTORPD(YMM02, iptr->zpos0[0]);
  VSTORPD(YMM03, iptr->xpos1[0]);
  VSTORPD(YMM04, iptr->ypos1[0]);
  VSTORPD(YMM05, iptr->zpos1[0]);
  VSTORPS(YMM06, iptr->xvel01[0]);
  VSTORPS(YMM07, iptr->yvel01[0]);
  VSTORPS(YMM08, iptr->zvel01[0]);
  VSTORPS(YMM09, iptr->id01[0]);
  VSTORPS(YMM10, iptr->veps2[0]);
  VSTORPS(YMM11, iptr->rmin2[0]);
  VSTORPS(YMM12, iptr->in[0]);
  VZEROALL; /* clear accumulators YMM09..YMM15 */

  for(j = 0; j < nj; j += JPARA, jptr++){
    // if nj % 2 != 0 ATARU
    // dx -> YMM03 (double-precision position difference, rounded to float)
    VLOADPD(jptr->xpos[0], YMM00);
    VSUBPD_M(iptr->xpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->xpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM03);
    // dy -> YMM04
    VLOADPD(jptr->ypos[0], YMM00);
    VSUBPD_M(iptr->ypos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->ypos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM04);
    // dz -> YMM05
    VLOADPD(jptr->zpos[0], YMM00);
    VSUBPD_M(iptr->zpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->zpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM05);
    // dr^2 (softened: starts from eps2; mul+add instead of FMA)
    VLOADPS(iptr->veps2[0], YMM01);
    VMULPS(YMM03, YMM03, YMM00);
    VADDPS(YMM00, YMM01, YMM01);
    VMULPS(YMM04, YMM04, YMM00);
    VADDPS(YMM00, YMM01, YMM01);
    VMULPS(YMM05, YMM05, YMM00);
    VADDPS(YMM00, YMM01, YMM01);
    // - 2 / r -> YMM01 (rsqrt estimate + one Newton step)
    VRSQRTPS(YMM01, YMM02);
    VMULPS(YMM02, YMM01, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    VSUBPS_M(three[0], YMM01, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    // mask (zero out the self-interaction lane by comparing particle ids)
    VLOADPS(jptr->indx[0], YMM02);
    VLOADPS(iptr->id01[0], YMM00);
    VCMPNEQPS(YMM00, YMM02, YMM02);
    VANDPS(YMM02, YMM01, YMM01);
    // nearest neighbour (free: YMM00, YMM02, YMM06, YMM07, YMM08)
    // rmin2 tracks the minimum of -2/r; the matching j-index is kept in 'in'.
    VLOADPS(iptr->rmin2[0], YMM00);
    VMINPS(YMM01, YMM00, YMM02);
    VSTORPS(YMM02, iptr->rmin2[0]);
    VCMPPS(YMM01, YMM00, YMM02, GT);
    VLOADPS(jptr->indx[0], YMM06);
    VANDPS(YMM02, YMM06, YMM07);
    VCMPPS(YMM01, YMM00, YMM08, LE);
    VANDPS_M(iptr->in[0], YMM08, YMM08);
    VADDPS(YMM08, YMM07, YMM07);
    VSTORPS(YMM07, iptr->in[0]);
    // potential: accumulate -m/r in double precision (YMM09)
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VCVTPS2PD(XMM02, YMM00);
    VUP2LOW(YMM02, XMM06);
    VCVTPS2PD(XMM06, YMM06);
    VHADDPD(YMM06, YMM00, YMM07);
    VADDPD(YMM07, YMM09, YMM09);
    // dvx, dvy, dvz (vj - vi)
    VLOADPS(jptr->xvel[0], YMM06);
    VSUBPS_M(iptr->xvel01[0], YMM06, YMM06);
    VLOADPS(jptr->yvel[0], YMM07);
    VSUBPS_M(iptr->yvel01[0], YMM07, YMM07);
    VLOADPS(jptr->zvel[0], YMM08);
    VSUBPS_M(iptr->zvel01[0], YMM08, YMM08);
    // xv -> YMM00 (dr . dv)
    VMULPS(YMM03, YMM06, YMM00);
    VMULPS(YMM04, YMM07, YMM02);
    VADDPS(YMM02, YMM00, YMM00);
    VMULPS(YMM05, YMM08, YMM02);
    VADDPS(YMM02, YMM00, YMM00);
    // YMM00: 3.0 * xv / r^2, YMM02: - m / r^3
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VMULPS(YMM01, YMM01, YMM01);
    VMULPS(YMM01, YMM00, YMM00);
    VMULPS(YMM01, YMM02, YMM02);
    VMULPS_M(threefourth[0], YMM00, YMM00);
    // prefetch next j-particle block
    PREFETCH((jptr+1)->xpos[0]);
    PREFETCH((jptr+1)->zpos[0]);
    PREFETCH((jptr+1)->mass[0]);
    PREFETCH((jptr+1)->yvel[0]);
    // jx1 (jerk, velocity-difference part)
    VMULPS(YMM02, YMM06, YMM06);
    VADDPS(YMM06, YMM13, YMM13);
    // jy1
    VMULPS(YMM02, YMM07, YMM07);
    VADDPS(YMM07, YMM14, YMM14);
    // jz1
    VMULPS(YMM02, YMM08, YMM08);
    VADDPS(YMM08, YMM15, YMM15);
    // ax (accumulate in double precision, YMM10)
    VMULPS(YMM02, YMM03, YMM03);
    VCVTPS2PD(XMM03, YMM06);
    VUP2LOW(YMM03, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM10, YMM10);
    // ay (YMM11)
    VMULPS(YMM02, YMM04, YMM04);
    VCVTPS2PD(XMM04, YMM06);
    VUP2LOW(YMM04, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM11, YMM11);
    // az (YMM12)
    VMULPS(YMM02, YMM05, YMM05);
    VCVTPS2PD(XMM05, YMM06);
    VUP2LOW(YMM05, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM12, YMM12);
    // jx2 (jerk, position-difference part, subtracted)
    VMULPS(YMM00, YMM03, YMM03);
    VSUBPS(YMM03, YMM13, YMM13);
    // jy2
    VMULPS(YMM00, YMM04, YMM04);
    VSUBPS(YMM04, YMM14, YMM14);
    // jz2
    VMULPS(YMM00, YMM05, YMM05);
    VSUBPS(YMM05, YMM15, YMM15);
  }

  /* Spill accumulators and reduce: double lanes {0,2}/{1,3} and float jerk
   * lanes 0-3/4-7 belong to i-particle 0/1 respectively. */
  VSTORPD(YMM09, iptr->pot[0]);
  VSTORPD(YMM10, iptr->xacc[0]);
  VSTORPD(YMM11, iptr->yacc[0]);
  VSTORPD(YMM12, iptr->zacc[0]);
  VSTORPS(YMM13, iptr->xjrk[0]);
  VSTORPS(YMM14, iptr->yjrk[0]);
  VSTORPS(YMM15, iptr->zjrk[0]);
  accjerk[0].xacc = iptr->xacc[0] + iptr->xacc[2];
  accjerk[0].yacc = iptr->yacc[0] + iptr->yacc[2];
  accjerk[0].zacc = iptr->zacc[0] + iptr->zacc[2];
  accjerk[0].pot  = iptr->pot[0] + iptr->pot[2];
  accjerk[0].xjrk = iptr->xjrk[0] + iptr->xjrk[1] + iptr->xjrk[2] + iptr->xjrk[3];
  accjerk[0].yjrk = iptr->yjrk[0] + iptr->yjrk[1] + iptr->yjrk[2] + iptr->yjrk[3];
  accjerk[0].zjrk = iptr->zjrk[0] + iptr->zjrk[1] + iptr->zjrk[2] + iptr->zjrk[3];
  /* Nearest neighbour of i-particle 0: scan its JPARA lanes. */
  for(true_rmin2 = 1e30, j = 0; j < JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2 = iptr->rmin2[j];
      accjerk[0].rnnb = - 2.0 / true_rmin2;
      accjerk[0].nnb = (int)iptr->in[j];
    }
  }
  accjerk[1].xacc = iptr->xacc[1] + iptr->xacc[3];
  accjerk[1].yacc = iptr->yacc[1] + iptr->yacc[3];
  accjerk[1].zacc = iptr->zacc[1] + iptr->zacc[3];
  accjerk[1].pot  = iptr->pot[1] + iptr->pot[3];
  accjerk[1].xjrk = iptr->xjrk[4] + iptr->xjrk[5] + iptr->xjrk[6] + iptr->xjrk[7];
  accjerk[1].yjrk = iptr->yjrk[4] + iptr->yjrk[5] + iptr->yjrk[6] + iptr->yjrk[7];
  accjerk[1].zjrk = iptr->zjrk[4] + iptr->zjrk[5] + iptr->zjrk[6] + iptr->zjrk[7];
  /* Nearest neighbour of i-particle 1: its lanes are 4 .. 4+JPARA-1. */
  for(true_rmin2 = 1e30, j = 4; j < 4 + JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2 = iptr->rmin2[j];
      accjerk[1].rnnb = - 2.0 / true_rmin2;
      accjerk[1].nnb = (int)iptr->in[j];
    }
  }

  free(iptr);
  return;
}
/*
 * Parallel root-set tracing: each collector thread (1) pushes every root
 * slot into trace-stack tasks, then (2) repeatedly grabs task blocks from
 * the shared mark_task_pool and forwards the objects they reference.
 * Threads synchronize on num_finished_collectors; a thread that sees new
 * work appear after "finishing" re-enters the trace loop via 'retry'.
 */
static void collector_trace_rootsets(Collector* collector)
{
  GC* gc = collector->gc;
  GC_Metadata* metadata = gc->metadata;
#ifdef GC_GEN_STATS
  GC_Gen_Collector_Stats* stats = (GC_Gen_Collector_Stats*)collector->stats;
#endif
  unsigned int num_active_collectors = gc->num_active_collectors;
  /* Reset the shared finished-count once (only the first arriving thread's
   * CAS succeeds). */
  atomic_cas32( &num_finished_collectors, 0, num_active_collectors);

  Space* space = collector->collect_space;
  collector->trace_stack = free_task_pool_get_entry(metadata);

  /* find root slots saved by 1. active mutators, 2. exited mutators,
     3. last cycle collectors */
  Vector_Block* root_set = pool_iterator_next(metadata->gc_rootset_pool);

  /* first step: copy all root objects to trace tasks. */
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: copy root objects to trace stack ...");
  while(root_set){
    POINTER_SIZE_INT* iter = vector_block_iterator_init(root_set);
    while(!vector_block_iterator_end(root_set,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(root_set, iter);

      assert(*p_ref);  /* root ref can't be NULL, but remset can be */

      collector_tracestack_push(collector, p_ref);
#ifdef GC_GEN_STATS
      gc_gen_collector_update_rootset_ref_num(stats);
#endif
    }
    root_set = pool_iterator_next(metadata->gc_rootset_pool);
  }
  /* put back the last trace_stack task */
  pool_put_entry(metadata->mark_task_pool, collector->trace_stack);

  /* second step: iterate over the trace tasks and forward objects */
  collector->trace_stack = free_task_pool_get_entry(metadata);

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish copying root objects to trace stack.");
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: trace and forward objects ...");

retry:
  Vector_Block* trace_task = pool_get_entry(metadata->mark_task_pool);

  while(trace_task){
    POINTER_SIZE_INT* iter = vector_block_iterator_init(trace_task);
    while(!vector_block_iterator_end(trace_task,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(trace_task, iter);
#ifdef PREFETCH_SUPPORTED
      /* DO PREFETCH: warm the cache with the next slot in this task block. */
      if( mark_prefetch ) {
        if(!vector_block_iterator_end(trace_task, iter)) {
          REF *pref= (REF*) *iter;
          PREFETCH( read_slot(pref));
        }
      }
#endif
      trace_object(collector, p_ref);
      if(collector->result == FALSE)  break; /* force return */
    }
    /* Recycle the drained task block. */
    vector_stack_clear(trace_task);
    pool_put_entry(metadata->free_task_pool, trace_task);
    if(collector->result == FALSE){
      gc_task_pool_clear(metadata->mark_task_pool);
      break; /* force return */
    }
    trace_task = pool_get_entry(metadata->mark_task_pool);
  }

  /* A collector comes here when seeing an empty mark_task_pool.
     The last collector will ensure all the tasks are finished. */
  atomic_inc32(&num_finished_collectors);
  while(num_finished_collectors != num_active_collectors){
    if( pool_is_empty(metadata->mark_task_pool)) continue;
    /* we can't grab the task here, because of a race condition. If we grab
       the task, and the pool is empty, other threads may fall to this
       barrier and then pass. */
    atomic_dec32(&num_finished_collectors);
    goto retry;
  }
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish tracing and forwarding objects.");

  /* now we are done, but each collector has a private stack that is empty */
  trace_task = (Vector_Block*)collector->trace_stack;
  vector_stack_clear(trace_task);
  pool_put_entry(metadata->free_task_pool, trace_task);
  collector->trace_stack = NULL;
  return;
}
/*
 * Parallel root-set tracing (generational variant): like the plain version,
 * but NULL root slots are skipped and only references whose target object
 * belongs to the nursery (obj_belongs_to_nos) are pushed for tracing.
 * Threads synchronize on num_finished_collectors with a retry barrier.
 */
static void collector_trace_rootsets(Collector* collector)
{
  GC* gc = collector->gc;
  GC_Metadata* metadata = gc->metadata;
#ifdef GC_GEN_STATS
  GC_Gen_Collector_Stats* stats = (GC_Gen_Collector_Stats*)collector->stats;
#endif
  unsigned int num_active_collectors = gc->num_active_collectors;
  /* Reset the shared finished-count once (only the first arriving thread's
   * CAS succeeds). */
  atomic_cas32( &num_finished_collectors, 0, num_active_collectors);

  Space* space = collector->collect_space;
  collector->trace_stack = free_task_pool_get_entry(metadata);

  /* find root slots saved by 1. active mutators, 2. exited mutators,
     3. last cycle collectors */
  Vector_Block* root_set = pool_iterator_next(metadata->gc_rootset_pool);

  /* first step: copy all root objects to trace tasks. */
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: copy root objects to trace stack ......");
  while(root_set){
    POINTER_SIZE_INT* iter = vector_block_iterator_init(root_set);
    while(!vector_block_iterator_end(root_set,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(root_set,iter);
      if(!*p_ref) continue;  /* root ref can't be NULL, but remset can be */
      Partial_Reveal_Object *p_obj = read_slot(p_ref);
#ifdef GC_GEN_STATS
      gc_gen_collector_update_rootset_ref_num(stats);
#endif
      /* Only nursery objects are traced in this (minor) collection. */
      if(obj_belongs_to_nos(p_obj)){
        collector_tracestack_push(collector, p_ref);
      }
    }
    root_set = pool_iterator_next(metadata->gc_rootset_pool);
  }
  /* put back the last trace_stack task */
  pool_put_entry(metadata->mark_task_pool, collector->trace_stack);

  /* second step: iterate over the trace tasks and forward objects */
  collector->trace_stack = free_task_pool_get_entry(metadata);

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish copying root objects to trace stack.");
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: trace and forward objects ......");

retry:
  Vector_Block* trace_task = pool_get_entry(metadata->mark_task_pool);

  while(trace_task){
    POINTER_SIZE_INT* iter = vector_block_iterator_init(trace_task);
    while(!vector_block_iterator_end(trace_task,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(trace_task,iter);
      assert(*p_ref); /* a task can't be NULL, it was checked before put into the task stack */
#ifdef PREFETCH_SUPPORTED
      /* DO PREFETCH: warm the cache with the next slot in this task block. */
      if( mark_prefetch ) {
        if(!vector_block_iterator_end(trace_task, iter)) {
          REF *pref= (REF*) *iter;
          PREFETCH( read_slot(pref));
        }
      }
#endif
      /* in sequential version, we only trace same object once, but we were
         using a local hashset for that, which couldn't catch the repetition
         between multiple collectors. This is subject to more study. */

      /* FIXME:: we should not let root_set empty during working, other may
         want to steal it. degenerate my stack into root_set, and grab
         another stack */

      /* a task has to belong to collected space, it was checked before put
         into the stack */
      trace_object(collector, p_ref);
      if(collector->result == FALSE)  break; /* force return */
    }
    /* Recycle the drained task block. */
    vector_stack_clear(trace_task);
    pool_put_entry(metadata->free_task_pool, trace_task);
    if(collector->result == FALSE){
      gc_task_pool_clear(metadata->mark_task_pool);
      break; /* force return */
    }
    trace_task = pool_get_entry(metadata->mark_task_pool);
  }

  /* Barrier: wait until every collector has drained the task pool; re-enter
     the trace loop if new work shows up in the meantime. */
  atomic_inc32(&num_finished_collectors);
  while(num_finished_collectors != num_active_collectors){
    if( pool_is_empty(metadata->mark_task_pool)) continue;
    /* we can't grab the task here, because of a race condition. If we grab
       the task, and the pool is empty, other threads may fall to this
       barrier and then pass. */
    atomic_dec32(&num_finished_collectors);
    goto retry;
  }
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish tracing and forwarding objects.");

  /* now we are done, but each collector has a private stack that is empty */
  trace_task = (Vector_Block*)collector->trace_stack;
  vector_stack_clear(trace_task);
  pool_put_entry(metadata->free_task_pool, trace_task);
  collector->trace_stack = NULL;
  return;
}
/* Gravity micro-kernel (AVX macro assembly, 2-way software-pipelined j-loop)
 * with per-j-particle softening.
 *
 * Accumulates the softened interaction of nj "j"-particles (jpdata; records
 * carry position/mass in xm and softening in ep) against a block of
 * "i"-particles broadcast from ipdata, then stores the four partial sums to
 * fodata->{ax,ay,az,phi}.
 *
 * NOTE(review): VLOADPS/VSHUFPS/VRSQRTPS/... expand to inline-asm macros on
 * YMM registers; the operand order is assumed to be (sources..., destination)
 * and the register aliases (XI = XMM04/YMM04 etc.) come from the macro
 * header -- confirm there.  The schedule hides load latency: the data for
 * the NEXT pair of j-particles is loaded and shuffled while the current
 * interaction is computed, so statement order must not be changed casually.
 */
void GravityKernel0(pIpdata ipdata, pFodata fodata, pJpdata0 jpdata, int nj)
{
  int j;

  PREFETCH(jpdata[0]);

  VZEROALL; /* clear the YMM accumulators (AX/AY/AZ/PHI among them) */

  /* broadcast i-particle coordinates and softening into both 128-bit lanes */
  VLOADPS(*ipdata->x, XMM04);
  VLOADPS(*ipdata->y, XMM05);
  VLOADPS(*ipdata->z, XMM06);
  VLOADPS(*ipdata->eps2, XMM15);
  VPERM2F128(XI, XI, XI, 0x00);
  VPERM2F128(YI, YI, YI, 0x00);
  VPERM2F128(ZI, ZI, ZI, 0x00);
  VPERM2F128(EPSI2, EPSI2, EPSI2, 0x00);

  /* prime the pipeline: load and splat the first two j-particles */
  VLOADPS(jpdata->xm[0][0], Z2);
  VADDPS_M(jpdata->ep[0][0], EPSI2, EPSJ2); /* combined softening: eps_i^2 + eps_j^2 */
  jpdata++;
  VSHUFPS(Z2, Z2, X2, 0x00);
  VSHUFPS(Z2, Z2, MJ, 0xff);
  VSHUFPS(Z2, Z2, Y2, 0x55);
  VSHUFPS(Z2, Z2, Z2, 0xaa);

  for(j = 0; j < nj; j += 2){
    /* dr = r_j - r_i (presumably; depends on macro operand order) */
    VSUBPS(XI, X2, DX);
    VSUBPS(ZI, Z2, DZ);
    VSUBPS(YI, Y2, DY);
    /* r^2 = dx^2 + dy^2 + dz^2 + eps^2 */
    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, Z2);
    VMULPS(DY, DY, Y2);
    VADDPS(X2, Z2, X2);
    VADDPS(EPSJ2, Y2, Y2);
    VADDPS(X2, Y2, Y2);
    /* start fetching the next pair of j-particles while we compute */
    VLOADPS(jpdata->xm[0][0], Z2);
    VADDPS_M(jpdata->ep[0][0], EPSI2, EPSJ2);
    jpdata++;
    /* rinv ~ rsqrt(r^2); PHI -= m_j*rinv; A += m_j*rinv^3 * dr */
    VRSQRTPS(Y2, X2);
    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);
    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);
    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);
    /* splat the freshly loaded j-data for the next iteration */
    VSHUFPS(Z2, Z2, X2, 0x00);
    VSHUFPS(Z2, Z2, MJ, 0xff);
    VSHUFPS(Z2, Z2, Y2, 0x55);
    VSHUFPS(Z2, Z2, Z2, 0xaa);
    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }

  /* horizontal reduction: fold the upper 128-bit lane into the lower one */
  VEXTRACTF128(AX, XMM00, 0x01);
  VADDPS(AX, YMM00, AX);
  VEXTRACTF128(AY, XMM01, 0x01);
  VADDPS(AY, YMM01, AY);
  VEXTRACTF128(AZ, XMM02, 0x01);
  VADDPS(AZ, YMM02, AZ);
  VEXTRACTF128(PHI, XMM03, 0x01);
  VADDPS(PHI, YMM03, PHI);

  VSTORPS(XMM08, *fodata->ax);
  VSTORPS(XMM09, *fodata->ay);
  VSTORPS(XMM10, *fodata->az);
  VSTORPS(XMM11, *fodata->phi);
}
/* Gravity micro-kernel (AVX macro assembly) with shared softening only.
 *
 * Same interaction as GravityKernel0, but the j-particle record (pJpdata)
 * carries no per-particle softening: eps2 comes from the i-particles alone.
 * The j-loop is unrolled NUNROLL-fold (2 or 4; anything else is a
 * compile-time #error).  Partial sums are stored to fodata->{ax,ay,az,phi}.
 *
 * NOTE(review): macro operand order is assumed (sources..., destination);
 * the schedule is software-pipelined (next j-data loaded/shuffled during the
 * current interaction), so do not reorder statements casually.
 */
void GravityKernel(pIpdata ipdata, pFodata fodata, pJpdata jpdata, int nj)
{
  int j;

  PREFETCH(jpdata[0]);

  VZEROALL; /* clear the YMM accumulators */

  /* broadcast i-particle coordinates and eps2 into both 128-bit lanes */
  VLOADPS(*ipdata->x, XMM04);
  VLOADPS(*ipdata->y, XMM05);
  VLOADPS(*ipdata->z, XMM06);
  VLOADPS(*ipdata->eps2, XMM15);
  VPERM2F128(XI, XI, XI, 0x00);
  VPERM2F128(YI, YI, YI, 0x00);
  VPERM2F128(ZI, ZI, ZI, 0x00);
  VPERM2F128(EPS2, EPS2, EPS2, 0x00);

#if (2 == NUNROLL)
  /* 2-way unroll: one j-vector (J1) in flight, splat into X2/Y2/J2/MJ */
  VLOADPS(*(jpdata), J1);
  jpdata += 2;
  VSHUFPS(J1, J1, X2, 0x00);
  VSHUFPS(J1, J1, J2, 0xaa);
  VSHUFPS(J1, J1, MJ, 0xff);
  VSHUFPS(J1, J1, Y2, 0x55);
  for(j = 0; j < nj; j += 2){
    /* dr = r_j - r_i */
    VSUBPS(XI, X2, DX);
    VSUBPS(ZI, J2, DZ);
    VSUBPS(YI, Y2, DY);
    VLOADPS(*(jpdata), J1); /* fetch the next pair while computing */
    jpdata += 2;
    /* r^2 = dx^2 + dy^2 + dz^2 + eps2 */
    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, J2);
    VMULPS(DY, DY, Y2);
    VADDPS(X2, J2, J2);
    VADDPS(EPS2, Y2, Y2);
    VADDPS(J2, Y2, Y2);
    /* rinv ~ rsqrt(r^2); PHI -= m_j*rinv; scale dr by m_j*rinv^3 */
    VRSQRTPS(Y2, X2);
    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);
    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);
    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);
    /* splat the freshly loaded j-data for the next iteration */
    VSHUFPS(J1, J1, X2, 0x00);
    VSHUFPS(J1, J1, J2, 0xaa);
    VSHUFPS(J1, J1, MJ, 0xff);
    VSHUFPS(J1, J1, Y2, 0x55);
    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }
#elif (4 == NUNROLL)
#if 1
  /* 4-way unroll, active variant: two j-vectors (J1, J2) ping-pong */
  VLOADPS(*(jpdata), J1);
  VLOADPS(*(jpdata+2), J2);
  jpdata += 4;
  VSHUFPS(J1, J1, X2, 0x00);
  VSHUFPS(J1, J1, Y2, 0x55);
  VSHUFPS(J1, J1, MJ, 0xff);
  VSHUFPS(J1, J1, J1, 0xaa);
  for(j = 0 ; j < nj; j += 4) {
    /* first pair: consume data splatted from J1; reload J1 mid-flight */
    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, J1, DZ);
    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, J1);
    VMULPS(DY, DY, Y2);
    VADDPS(J1, X2, X2);
    VADDPS(EPS2, Y2, Y2);
    VADDPS(Y2, X2, Y2);
    VLOADPS(*(jpdata), J1);
    VRSQRTPS(Y2, X2);
    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);
    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);
    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);
    VSHUFPS(J2, J2, X2, 0x00);
    VSHUFPS(J2, J2, MJ, 0xff);
    VSHUFPS(J2, J2, Y2, 0x55);
    VSHUFPS(J2, J2, J2, 0xaa);
    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
    /* second pair: consume data splatted from J2; reload J2 and prefetch */
    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, J2, DZ);
    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, J2);
    VMULPS(DY, DY, Y2);
    VADDPS(J2, X2, X2);
    VADDPS(EPS2, Y2, Y2);
    VADDPS(Y2, X2, Y2);
    VLOADPS(*(jpdata+2), J2);
    VRSQRTPS(Y2, X2);
    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);
    jpdata += 4;
    PREFETCH(*(jpdata));
    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);
    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);
    VSHUFPS(J1, J1, X2, 0x00);
    VSHUFPS(J1, J1, MJ, 0xff);
    VSHUFPS(J1, J1, Y2, 0x55);
    VSHUFPS(J1, J1, J1, 0xaa);
    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }
#else
  /* 4-way unroll, alternative scheduling (disabled by the #if 1 above) */
  VLOADPS(*(jpdata), J1);
  VLOADPS(*(jpdata+2), J2);
  jpdata += 4;
  VSHUFPS(J1, J1, X2, 0x00);
  VSHUFPS(J1, J1, Y2, 0x55);
  VSHUFPS(J1, J1, MJ, 0xaa);
  VSHUFPS(J1, J1, J1, 0xff);
  for(j = 0 ; j < nj; j += 4) {
    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, MJ, DZ);
    VMULPS(DX, DX, X2);
    VMULPS(DY, DY, Y2);
    VMULPS(DZ, DZ, MJ);
    VADDPS(X2, Y2, Y2);
    VADDPS(EPS2, MJ, MJ);
    VADDPS(Y2, MJ, Y2);
    VRSQRTPS(Y2, X2);
    VMULPS(X2, J1, Y2);
    VMULPS(X2, X2, X2);
    VLOADPS(*(jpdata), J1);
    VSUBPS(Y2, PHI, PHI);
    VMULPS(X2, Y2, Y2);
    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);
    VSHUFPS(J2, J2, X2, 0x00);
    VSHUFPS(J2, J2, Y2, 0x55);
    VSHUFPS(J2, J2, MJ, 0xaa);
    VSHUFPS(J2, J2, J2, 0xff);
    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, MJ, DZ);
    VMULPS(DX, DX, X2);
    VMULPS(DY, DY, Y2);
    VMULPS(DZ, DZ, MJ);
    VADDPS(X2, Y2, Y2);
    VADDPS(EPS2, MJ, MJ);
    VADDPS(Y2, MJ, Y2);
    VRSQRTPS(Y2, X2);
    VMULPS(X2, J2, Y2);
    VMULPS(X2, X2, X2);
    VLOADPS(*(jpdata+2), J2);
    jpdata += 4;
    PREFETCH(*(jpdata));
    VSUBPS(Y2, PHI, PHI);
    VMULPS(X2, Y2, Y2);
    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);
    VSHUFPS(J1, J1, X2, 0x00);
    VSHUFPS(J1, J1, Y2, 0x55);
    VSHUFPS(J1, J1, MJ, 0xaa);
    VSHUFPS(J1, J1, J1, 0xff);
    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }
#endif
#else
#error
#endif

  /* horizontal reduction: fold the upper 128-bit lane into the lower one */
  VEXTRACTF128(AX, XMM00, 0x01);
  VADDPS(AX, YMM00, AX);
  VEXTRACTF128(AY, XMM01, 0x01);
  VADDPS(AY, YMM01, AY);
  VEXTRACTF128(AZ, XMM02, 0x01);
  VADDPS(AZ, YMM02, AZ);
  VEXTRACTF128(PHI, XMM03, 0x01);
  VADDPS(PHI, YMM03, PHI);

  VSTORPS(XMM08, *fodata->ax);
  VSTORPS(XMM09, *fodata->ay);
  VSTORPS(XMM10, *fodata->az);
  VSTORPS(XMM11, *fodata->phi);
}