Example #1
  //
  // Unsweep for a sight map
  //
  static void Unsweep(Map *map)
  {
    START(unSweepTime);

    PREFETCH(&__prefetch);

    // Last sweep position
    S32 tileX = map->lastX;
    S32 tileZ = map->lastZ;

    // Last sweep radius
    S32 r = map->lastR;

    // Get viewing mask for lower layer
    U8 *mapLo = map->GetByteMap(Map::LV_LO);
    U8 maskLo = map->GetBitMask(Map::LV_LO);

    // Dirty cells that line of sight has changed in
    DirtyCells(tileX - r, tileZ - r, tileX + r, tileZ + r, map->lastTeam);

    // Iterate over all tiles within the last scan radius
    S32 first = XZToSeemap(-r, -r);

    for (S32 y = -r; y <= r; y++, first += MAPSIDE)
    {
      PREFETCH(&mapLo[first+MAPSIDE]);

      for (S32 x = -r, index = first; x <= r; x++, index++)
      {
        // Unsweep tile on the ground level if it was swept
        if (mapLo[index] & maskLo)
        {
          // Unsweep for all teams last swept
          CantSee(tileX + x, tileZ + y, x, y, map->lastTeam, Map::LV_LO);

          // Clear seen bit
          mapLo[index] &= ~maskLo;
        }
      }
    }

    // Reset last radius and team mask
    map->lastR = 0;
    map->lastTeam = 0;
    map->lastAlt = F32_MAX;

    STOP(unSweepTime);
  }
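The inner loop above prefetches the first byte of the next map row (mapLo[first+MAPSIDE]) while the current row is still being cleared. A minimal sketch of the same row-ahead pattern, assuming PREFETCH is a thin wrapper over the GCC/Clang __builtin_prefetch builtin (the names below are mine, not from the engine):

#include <stddef.h>

/* Hypothetical example: clear a tiled map row by row, touching the next
 * row's first cache line while the current row is being processed. */
static void clear_rows(unsigned char *map, size_t side, size_t rows)
{
  for (size_t y = 0; y < rows; y++) {
    if (y + 1 < rows)
      __builtin_prefetch(&map[(y + 1) * side]);  /* hide the row-change miss */
    for (size_t x = 0; x < side; x++)
      map[y * side + x] = 0;
  }
}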
Example #2
static void
TestAtomic64(void)
{
	uint64 z64, x64;

	z64 = 42;
	x64 = 0;
	PREFETCH(&z64);
	if(runtime_cas64(&z64, x64, 1))
		runtime_throw("cas64 failed");
	if(x64 != 0)
		runtime_throw("cas64 failed");
	x64 = 42;
	if(!runtime_cas64(&z64, x64, 1))
		runtime_throw("cas64 failed");
	if(x64 != 42 || z64 != 1)
		runtime_throw("cas64 failed");
	if(runtime_atomicload64(&z64) != 1)
		runtime_throw("load64 failed");
	runtime_atomicstore64(&z64, (1ull<<40)+1);
	if(runtime_atomicload64(&z64) != (1ull<<40)+1)
		runtime_throw("store64 failed");
	if(runtime_xadd64(&z64, (1ull<<40)+1) != (2ull<<40)+2)
		runtime_throw("xadd64 failed");
	if(runtime_atomicload64(&z64) != (2ull<<40)+2)
		runtime_throw("xadd64 failed");
	if(runtime_xchg64(&z64, (3ull<<40)+3) != (2ull<<40)+2)
		runtime_throw("xchg64 failed");
	if(runtime_atomicload64(&z64) != (3ull<<40)+3)
		runtime_throw("xchg64 failed");
}
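For readers more familiar with standard atomics: the cas64 semantics this test exercises (compare against an expected value, swap on match, report success) correspond to a strong compare-and-swap. A sketch in C11, with a helper name of my own; the Go runtime's actual implementation is per-architecture assembly:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* cas64-like helper: returns true iff *addr was `old` and is now `new_`. */
static bool cas64(_Atomic uint64_t *addr, uint64_t old, uint64_t new_)
{
  return atomic_compare_exchange_strong(addr, &old, new_);
}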
Example #3
static void trace_object(Collector *collector, REF *p_ref)
{ 
  forward_object(collector, p_ref);

  Vector_Block* trace_stack = (Vector_Block*)collector->trace_stack;
  while( !vector_stack_is_empty(trace_stack)){
    p_ref = (REF *)vector_stack_pop(trace_stack); 
#ifdef PREFETCH_SUPPORTED
    /* DO PREFETCH */
    if(mark_prefetch) {
      if(!vector_stack_is_empty(trace_stack)) {
        REF *pref = (REF*)vector_stack_read(trace_stack, 0);
        PREFETCH( read_slot(pref) );
      }
    }
#endif
    forward_object(collector, p_ref);
    trace_stack = (Vector_Block*)collector->trace_stack;
  }
  return; 
}
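The #ifdef block above implements a one-item lookahead: while the popped reference is being forwarded, the object behind the next stack entry is prefetched so it is already warm in cache when its turn comes. A generic sketch of the pattern, with hypothetical stack types and names of my own:

#include <stddef.h>

typedef struct { void **items; size_t n; } WorkStack;  /* hypothetical */

static void drain(WorkStack *s, void (*process)(void *))
{
  while (s->n > 0) {
    void *p = s->items[--s->n];
    if (s->n > 0)
      __builtin_prefetch(s->items[s->n - 1]);  /* warm the next item */
    process(p);
  }
}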
Example #4
void gravity_kernel2n(int nj, pPrdPosVel posvel, pNewAccJrk accjerk, int i, int ithread)
{
  int ret;
  int j;
  double true_rmin2;
  float hinv0, hinv1;
  pPred_Mem jptr = pred_mem;
  pIparticle iptr;
  pNeighbourList nbptr, nbptr0 = neighbour[ithread];
  float ten = 10.0, minusone = -1.0;

  if(posvel[0].h2 == 0.0)
    hinv0 = - 1e10;
  else
    hinv0 = - 2.0 / sqrt(posvel[0].h2);
  if(posvel[1].h2 == 0.0)
    hinv1 = - 1e10;
  else
    hinv1 = - 2.0 / sqrt(posvel[1].h2);

  ret = posix_memalign((void **)&iptr, 32, NVAR_IP * 32);
  assert(ret == 0);

  VBROADCASTSD(posvel[0].xpos, YMM00);
  VBROADCASTSD(posvel[0].ypos, YMM01);
  VBROADCASTSD(posvel[0].zpos, YMM02);

  VBROADCASTSD(posvel[1].xpos, YMM03);
  VBROADCASTSD(posvel[1].ypos, YMM04);
  VBROADCASTSD(posvel[1].zpos, YMM05);

  VBROADCASTSS(posvel[0].xvel, XMM06);
  VBROADCASTSS(posvel[1].xvel, XMM07);
  VMERGE(YMM06, YMM07, YMM06);

  VBROADCASTSS(posvel[0].yvel, XMM08);
  VBROADCASTSS(posvel[1].yvel, XMM09);
  VMERGE(YMM08, YMM09, YMM07);

  VBROADCASTSS(posvel[0].zvel, XMM10);
  VBROADCASTSS(posvel[1].zvel, XMM11);
  VMERGE(YMM10, YMM11, YMM08);

  VBROADCASTSS(posvel[0].id, XMM12);
  VBROADCASTSS(posvel[1].id, XMM13);
  VMERGE(YMM12, YMM13, YMM09);

  VBROADCASTSS(posvel[0].eps2, XMM14);
  VBROADCASTSS(posvel[1].eps2, XMM15);
  VMERGE(YMM14, YMM15, YMM10);

  VBROADCASTSS(hinv0, XMM11);
  VBROADCASTSS(hinv1, XMM12);
  VMERGE(YMM11, YMM12, YMM11);

  VBROADCASTSS(ten, YMM12);
  VBROADCASTSS(minusone, YMM13);

  VSTORPD(YMM00, iptr->xpos0[0]);
  VSTORPD(YMM01, iptr->ypos0[0]);
  VSTORPD(YMM02, iptr->zpos0[0]);
  VSTORPD(YMM03, iptr->xpos1[0]);
  VSTORPD(YMM04, iptr->ypos1[0]);
  VSTORPD(YMM05, iptr->zpos1[0]);
  VSTORPS(YMM06, iptr->xvel01[0]);
  VSTORPS(YMM07, iptr->yvel01[0]);
  VSTORPS(YMM08, iptr->zvel01[0]);
  VSTORPS(YMM09, iptr->id01[0]);
  VSTORPS(YMM10, iptr->veps2[0]);
  VSTORPS(YMM11, iptr->hinv[0]);
  VSTORPS(YMM12, iptr->rmin2[0]);
  VSTORPS(YMM13, iptr->in[0]);

  VZEROALL;
  for(j = 0, nbptr = nbptr0; j < nj; j += JPARA, jptr++, nbptr++){ // if nj % 2 != 0 ATARU
    // dx -> YMM03
    VLOADPD(jptr->xpos[0], YMM00);
    VSUBPD_M(iptr->xpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->xpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM03);
    // dy -> YMM04
    VLOADPD(jptr->ypos[0], YMM00);
    VSUBPD_M(iptr->ypos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->ypos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM04);
    // dz -> YMM05
    VLOADPD(jptr->zpos[0], YMM00);
    VSUBPD_M(iptr->zpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->zpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM05);
    // dr^2
    VLOADPS(iptr->veps2[0], YMM01);
    VFMADDPS(YMM01, YMM03, YMM03);
    VFMADDPS(YMM01, YMM04, YMM04);
    VFMADDPS(YMM01, YMM05, YMM05);
    // - 2 / r -> YMM01
    VRSQRTPS(YMM01, YMM02);
    VMULPS(YMM02, YMM01, YMM01);
    VFMSUB213PS_M(three[0], YMM02, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    // mask
    VLOADPS(jptr->indx[0], YMM02);
    VLOADPS(iptr->id01[0], YMM00);
    VCMPNEQPS(YMM00, YMM02, YMM02);
    VANDPS(YMM02, YMM01, YMM01);    
    // nearest neighbour (free: YMM00, YMM02, YMM06, YMM07, YMM08)
    VLOADPS(iptr->rmin2[0], YMM00);
    VMINPS(YMM01, YMM00, YMM02);
    VSTORPS(YMM02, iptr->rmin2[0]);
    VCMPPS(YMM01, YMM00, YMM02, GT);
    VLOADPS(jptr->indx[0], YMM06);
    VANDPS(YMM02, YMM06, YMM07);
    VCMPPS(YMM01, YMM00, YMM08, LE);
    VANDPS_M(iptr->in[0], YMM08, YMM08);
    VADDPS(YMM08, YMM07, YMM07);
    VSTORPS(YMM07, iptr->in[0]);
    // neighbour list
    VLOADPS(iptr->hinv[0], YMM00);
    VCMPPS(YMM00, YMM01, YMM00, LE);
    VLOADPS(flag[0], YMM02);
    VANDPS(YMM02, YMM00, YMM00);
    VSTORPS(YMM00, nbptr->flag[0]);
    // potential
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VCVTPS2PD(XMM02, YMM00);
    VUP2LOW(YMM02, XMM06);
    VCVTPS2PD(XMM06, YMM06);
    VHADDPD(YMM06, YMM00, YMM07);
    VADDPD(YMM07, YMM09, YMM09);
    // dvx, dvy, dvz (vj - vi)
    VLOADPS(jptr->xvel[0], YMM06);
    VSUBPS_M(iptr->xvel01[0], YMM06, YMM06);
    VLOADPS(jptr->yvel[0], YMM07);
    VSUBPS_M(iptr->yvel01[0], YMM07, YMM07);
    VLOADPS(jptr->zvel[0], YMM08);
    VSUBPS_M(iptr->zvel01[0], YMM08, YMM08);
    // xv -> YMM00
    VMULPS(YMM03, YMM06, YMM00);
    VFMADDPS(YMM00, YMM04, YMM07);
    VFMADDPS(YMM00, YMM05, YMM08);
    // YMM00: 3.0 * xv / r^2, YMM02: - m / r^3
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VMULPS(YMM01, YMM01, YMM01);
    VMULPS(YMM01, YMM00, YMM00);
    VMULPS(YMM01, YMM02, YMM02);
    VMULPS_M(threefourth[0], YMM00, YMM00);
    // prefetch
    PREFETCH((jptr+1)->xpos[0]);
    PREFETCH((jptr+1)->zpos[0]);
    PREFETCH((jptr+1)->mass[0]);
    PREFETCH((jptr+1)->yvel[0]);
    // jx1, jy1, jz1
    VFMADDPS(YMM13, YMM02, YMM06);
    VFMADDPS(YMM14, YMM02, YMM07);
    VFMADDPS(YMM15, YMM02, YMM08);
    // ax
    VMULPS(YMM02, YMM03, YMM03);
    VCVTPS2PD(XMM03, YMM06);
    VUP2LOW(YMM03, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM10, YMM10);
    // ay
    VMULPS(YMM02, YMM04, YMM04);
    VCVTPS2PD(XMM04, YMM06);
    VUP2LOW(YMM04, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM11, YMM11);
    // az
    VMULPS(YMM02, YMM05, YMM05);
    VCVTPS2PD(XMM05, YMM06);
    VUP2LOW(YMM05, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM12, YMM12);
    // jx2, jy2, jz2
    VFNMADDPS(YMM13, YMM00, YMM03);
    VFNMADDPS(YMM14, YMM00, YMM04);
    VFNMADDPS(YMM15, YMM00, YMM05);
  }

  VSTORPD(YMM09, iptr->pot[0]);
  VSTORPD(YMM10, iptr->xacc[0]);
  VSTORPD(YMM11, iptr->yacc[0]);
  VSTORPD(YMM12, iptr->zacc[0]);
  VSTORPS(YMM13, iptr->xjrk[0]);
  VSTORPS(YMM14, iptr->yjrk[0]);
  VSTORPS(YMM15, iptr->zjrk[0]);

  accjerk[0].xacc = iptr->xacc[0] + iptr->xacc[2];
  accjerk[0].yacc = iptr->yacc[0] + iptr->yacc[2];
  accjerk[0].zacc = iptr->zacc[0] + iptr->zacc[2];
  accjerk[0].pot  = iptr->pot[0]  + iptr->pot[2];
  accjerk[0].xjrk = iptr->xjrk[0] + iptr->xjrk[1] + iptr->xjrk[2] + iptr->xjrk[3];
  accjerk[0].yjrk = iptr->yjrk[0] + iptr->yjrk[1] + iptr->yjrk[2] + iptr->yjrk[3];
  accjerk[0].zjrk = iptr->zjrk[0] + iptr->zjrk[1] + iptr->zjrk[2] + iptr->zjrk[3];
  for(true_rmin2 = 1e30, j = 0; j < JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2    = iptr->rmin2[j];
      accjerk[0].rnnb = - 2.0 / true_rmin2;
      accjerk[0].nnb = (int)iptr->in[j];
    }
  }

  accjerk[1].xacc = iptr->xacc[1] + iptr->xacc[3];
  accjerk[1].yacc = iptr->yacc[1] + iptr->yacc[3];
  accjerk[1].zacc = iptr->zacc[1] + iptr->zacc[3];
  accjerk[1].pot  = iptr->pot[1]  + iptr->pot[3];
  accjerk[1].xjrk = iptr->xjrk[4] + iptr->xjrk[5] + iptr->xjrk[6] + iptr->xjrk[7];
  accjerk[1].yjrk = iptr->yjrk[4] + iptr->yjrk[5] + iptr->yjrk[6] + iptr->yjrk[7];
  accjerk[1].zjrk = iptr->zjrk[4] + iptr->zjrk[5] + iptr->zjrk[6] + iptr->zjrk[7];
  for(true_rmin2 = 1e30, j = 4; j < 4 + JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2    = iptr->rmin2[j];
      accjerk[1].rnnb = - 2.0 / true_rmin2;
      accjerk[1].nnb = (int)iptr->in[j];
    }
  }

  int jj;
  int nn0, nn1;
  for(nn0 = nn1 = 0, j = 0, jptr = pred_mem, nbptr = nbptr0; j < nj; j += JPARA, jptr++, nbptr++){
    for(jj = 0; jj < JPARA; jj++)
      if(nbptr->flag[jj] == 1.0){
	nbl[i][nn0] = (int)jptr->indx[jj];
	++nn0;
      }
    for(jj = 4; jj < JPARA + 4; jj++)
      if(nbptr->flag[jj] == 1.0){
	nbl[i+1][nn1] = (int)jptr->indx[jj];
	++nn1;
      }
  }

  if(nn0 > MAXLEN || nn1 > MAXLEN)
    nblerror = 1;

  nblen[i]   = nn0;
  nblen[i+1] = nn1;

  free(iptr);

  return;
}
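The "- 2 / r" block in this kernel is a reciprocal-square-root estimate refined by one fused Newton-Raphson step: with y ≈ 1/sqrt(r²), the sequence computes y·(r²·y·y − 3), which equals −2/sqrt(r²) up to second-order error in the estimate. A sketch of the same arithmetic in AVX intrinsics (the function name is mine; the kernel expresses it through the VRSQRTPS/VFMSUB213PS macro layer):

#include <immintrin.h>

/* -2/r from r^2, via an rsqrt estimate plus one Newton step (sketch). */
static inline __m256 minus_two_over_r(__m256 r2)
{
  const __m256 three = _mm256_set1_ps(3.0f);
  __m256 y = _mm256_rsqrt_ps(r2);                 /* ~12-bit 1/sqrt(r2)    */
  __m256 t = _mm256_sub_ps(_mm256_mul_ps(r2, _mm256_mul_ps(y, y)), three);
  return _mm256_mul_ps(y, t);                     /* y*(r2*y*y - 3) ~ -2/r */
}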
Example #5
void gravity_kernel(int nj, pPrdPosVel posvel, pNewAccJrk accjerk)
{
  int ret;
  int j;
  pPred_Mem jptr = pred_mem;
  pIparticle iptr;

  ret = posix_memalign((void **)&iptr, 32, NVAR_IP * 32);
  assert(ret == 0);

  VBROADCASTSD(posvel[0].xpos, YMM00);
  VBROADCASTSD(posvel[0].ypos, YMM01);
  VBROADCASTSD(posvel[0].zpos, YMM02);

  VBROADCASTSD(posvel[1].xpos, YMM03);
  VBROADCASTSD(posvel[1].ypos, YMM04);
  VBROADCASTSD(posvel[1].zpos, YMM05);

  VBROADCASTSS(posvel[0].xvel, XMM06);
  VBROADCASTSS(posvel[1].xvel, XMM07);
  VMERGE(YMM06, YMM07, YMM06);

  VBROADCASTSS(posvel[0].yvel, XMM08);
  VBROADCASTSS(posvel[1].yvel, XMM09);
  VMERGE(YMM08, YMM09, YMM07);

  VBROADCASTSS(posvel[0].zvel, XMM10);
  VBROADCASTSS(posvel[1].zvel, XMM11);
  VMERGE(YMM10, YMM11, YMM08);

  VBROADCASTSS(posvel[0].id, XMM12);
  VBROADCASTSS(posvel[1].id, XMM13);
  VMERGE(YMM12, YMM13, YMM09);

  VBROADCASTSS(posvel[0].eps2, XMM14);
  VBROADCASTSS(posvel[1].eps2, XMM15);
  VMERGE(YMM14, YMM15, YMM10);

  VSTORPD(YMM00, iptr->xpos0[0]);
  VSTORPD(YMM01, iptr->ypos0[0]);
  VSTORPD(YMM02, iptr->zpos0[0]);
  VSTORPD(YMM03, iptr->xpos1[0]);
  VSTORPD(YMM04, iptr->ypos1[0]);
  VSTORPD(YMM05, iptr->zpos1[0]);
  VSTORPS(YMM06, iptr->xvel01[0]);
  VSTORPS(YMM07, iptr->yvel01[0]);
  VSTORPS(YMM08, iptr->zvel01[0]);
  VSTORPS(YMM09, iptr->id01[0]);
  VSTORPS(YMM10, iptr->veps2[0]);

  VZEROALL;
  for(j = 0; j < nj; j += JPARA, jptr++){ // if nj % 2 != 0 ATARU
    // dx -> YMM03
    VLOADPD(jptr->xpos[0], YMM00);
    VSUBPD_M(iptr->xpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->xpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM03);
    // dy -> YMM04
    VLOADPD(jptr->ypos[0], YMM00);
    VSUBPD_M(iptr->ypos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->ypos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM04);
    // dz -> YMM05
    VLOADPD(jptr->zpos[0], YMM00);
    VSUBPD_M(iptr->zpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->zpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM05);
    // dr^2
    VLOADPS(iptr->veps2[0], YMM01);
    VFMADDPS(YMM01, YMM03, YMM03);
    VFMADDPS(YMM01, YMM04, YMM04);
    VFMADDPS(YMM01, YMM05, YMM05);
    // - 2 / r -> YMM01
    VRSQRTPS(YMM01, YMM02);
    VMULPS(YMM02, YMM01, YMM01);
    VFMSUB213PS_M(three[0], YMM02, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    // mask
    VLOADPS(jptr->indx[0], YMM02);
    VLOADPS(iptr->id01[0], YMM00);
    VCMPNEQPS(YMM00, YMM02, YMM02);
    VANDPS(YMM02, YMM01, YMM01);    
    // potential
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VCVTPS2PD(XMM02, YMM00);
    VUP2LOW(YMM02, XMM06);
    VCVTPS2PD(XMM06, YMM06);
    VHADDPD(YMM06, YMM00, YMM07);
    VADDPD(YMM07, YMM09, YMM09);
    // dvx, dvy, dvz (vj - vi)
    VLOADPS(jptr->xvel[0], YMM06);
    VSUBPS_M(iptr->xvel01[0], YMM06, YMM06);
    VLOADPS(jptr->yvel[0], YMM07);
    VSUBPS_M(iptr->yvel01[0], YMM07, YMM07);
    VLOADPS(jptr->zvel[0], YMM08);
    VSUBPS_M(iptr->zvel01[0], YMM08, YMM08);
    // xv -> YMM00
    VMULPS(YMM03, YMM06, YMM00);
    VFMADDPS(YMM00, YMM04, YMM07);
    VFMADDPS(YMM00, YMM05, YMM08);
    // YMM00: 3.0 * xv / r^2, YMM02: - m / r^3
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VMULPS(YMM01, YMM01, YMM01);
    VMULPS(YMM01, YMM00, YMM00);
    VMULPS(YMM01, YMM02, YMM02);
    VMULPS_M(threefourth[0], YMM00, YMM00);
    // prefetch
    PREFETCH((jptr+1)->xpos[0]);
    PREFETCH((jptr+1)->zpos[0]);
    PREFETCH((jptr+1)->mass[0]);
    PREFETCH((jptr+1)->yvel[0]);
    // jx1, jy1, jz1
    VFMADDPS(YMM13, YMM02, YMM06);
    VFMADDPS(YMM14, YMM02, YMM07);
    VFMADDPS(YMM15, YMM02, YMM08);
    // ax
    VMULPS(YMM02, YMM03, YMM03);
    VCVTPS2PD(XMM03, YMM06);
    VUP2LOW(YMM03, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM10, YMM10);
    // ay
    VMULPS(YMM02, YMM04, YMM04);
    VCVTPS2PD(XMM04, YMM06);
    VUP2LOW(YMM04, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM11, YMM11);
    // az
    VMULPS(YMM02, YMM05, YMM05);
    VCVTPS2PD(XMM05, YMM06);
    VUP2LOW(YMM05, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM12, YMM12);
    // jx2, jy2, jz2
    VFNMADDPS(YMM13, YMM00, YMM03);
    VFNMADDPS(YMM14, YMM00, YMM04);
    VFNMADDPS(YMM15, YMM00, YMM05);
  }

  VSTORPD(YMM09, iptr->pot[0]);
  VSTORPD(YMM10, iptr->xacc[0]);
  VSTORPD(YMM11, iptr->yacc[0]);
  VSTORPD(YMM12, iptr->zacc[0]);
  VSTORPS(YMM13, iptr->xjrk[0]);
  VSTORPS(YMM14, iptr->yjrk[0]);
  VSTORPS(YMM15, iptr->zjrk[0]);

  VZEROUPPER;

  accjerk[0].xacc = iptr->xacc[0] + iptr->xacc[2];
  accjerk[0].yacc = iptr->yacc[0] + iptr->yacc[2];
  accjerk[0].zacc = iptr->zacc[0] + iptr->zacc[2];
  accjerk[0].pot  = iptr->pot[0]  + iptr->pot[2];
  accjerk[0].xjrk = iptr->xjrk[0] + iptr->xjrk[1] + iptr->xjrk[2] + iptr->xjrk[3];
  accjerk[0].yjrk = iptr->yjrk[0] + iptr->yjrk[1] + iptr->yjrk[2] + iptr->yjrk[3];
  accjerk[0].zjrk = iptr->zjrk[0] + iptr->zjrk[1] + iptr->zjrk[2] + iptr->zjrk[3];

  accjerk[1].xacc = iptr->xacc[1] + iptr->xacc[3];
  accjerk[1].yacc = iptr->yacc[1] + iptr->yacc[3];
  accjerk[1].zacc = iptr->zacc[1] + iptr->zacc[3];
  accjerk[1].pot  = iptr->pot[1]  + iptr->pot[3];
  accjerk[1].xjrk = iptr->xjrk[4] + iptr->xjrk[5] + iptr->xjrk[6] + iptr->xjrk[7];
  accjerk[1].yjrk = iptr->yjrk[4] + iptr->yjrk[5] + iptr->yjrk[6] + iptr->yjrk[7];
  accjerk[1].zjrk = iptr->zjrk[4] + iptr->zjrk[5] + iptr->zjrk[6] + iptr->zjrk[7];

  free(iptr);

  return;
}
Example #6
void*
runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
{
	int32 sizeclass;
	uintptr tinysize, size1;
	intgo rate;
	MCache *c;
	MSpan *s;
	MLink *v, *next;
	byte *tiny;

	if(size == 0) {
		// All 0-length allocations use this single pointer.
		return &runtime·zerobase;
	}
	if(m->mallocing)
		runtime·throw("malloc/free - deadlock");

	m->locks++;
	m->mallocing = 1;

	if(DebugTypeAtBlockEnd)
		size += sizeof(uintptr);

	c = m->mcache;
	if(!runtime·debug.efence && size <= MaxSmallSize) {
		if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
			// Tiny allocator: combine several tiny requests into one block.
			tinysize = c->tinysize;
			if(size <= tinysize) {
				tiny = c->tiny;
				// Align the tiny pointer for the requested size.
				if((size&7) == 0)
					tiny = (byte*)ROUND((uintptr)tiny, 8);
				else if((size&3) == 0)
					tiny = (byte*)ROUND((uintptr)tiny, 4);
				else if((size&1) == 0)
					tiny = (byte*)ROUND((uintptr)tiny, 2);
				size1 = size + (tiny - c->tiny);
				if(size1 <= tinysize) {
					// The object fits into the existing tiny block.
					v = (MLink*)tiny;
					c->tiny += size1;
					c->tinysize -= size1;
					m->mallocing = 0;
					m->locks--;
					if(m->locks == 0 && g->preempt)
						g->stackguard0 = StackPreempt;
					return v;
				}
			}
			// Allocate a new TinySize block.
			s = c->alloc[TinySizeClass];
			if(s->freelist == nil)
				s = runtime·MCache_Refill(c, TinySizeClass);
			v = s->freelist;
			next = v->next;
			s->freelist = next;
			s->ref++;
			if(next != nil)  // prefetch the next object on the free list
				PREFETCH(next);
			((uint64*)v)[0] = 0;
			((uint64*)v)[1] = 0;
			// See if we need to replace the existing tiny block with the
			// new one based on the amount of remaining free space.
			if(TinySize-size > tinysize) {
				c->tiny = (byte*)v + size;
				c->tinysize = TinySize - size;
			}
			size = TinySize;
			goto done;
		}
		// Allocate from mcache free lists.
		if(size <= 1024-8)
			sizeclass = runtime·size_to_class8[(size+7)>>3];
		else
			sizeclass = runtime·size_to_class128[(size-1024+127)>>7];
		size = runtime·class_to_size[sizeclass];
		s = c->alloc[sizeclass];
		if(s->freelist == nil)
			s = runtime·MCache_Refill(c, sizeclass);
		v = s->freelist;
		next = v->next;
		s->freelist = next;
		s->ref++;
		if(next != nil)  // prefetch the next object on the free list
			PREFETCH(next);
		if(!(flag & FlagNoZero)) {
			v->next = nil;
			// Block is zeroed iff the second word is zero.
			if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
				runtime·memclr((byte*)v, size);
		}
	done:
		c->local_cachealloc += size;
	} else {
Example #7
//  Returns hits with _AT_MOST_ numMismatches mistakes.
bool
positionDB::getUpToNMismatches(uint64   mer,
                               uint32   numMismatches,
                               uint64*& posn,
                               uint64&  posnMax,
                               uint64&  posnLen) {

  PREFETCH(_hashedErrors);  //  Slightly better.

  posnLen = 0;

  if (_hashedErrors == 0L) {
    fprintf(stderr, "ERROR:  Nobody initialized getUpToNMismatches() by calling setUpMismatchMatcher().\n");
    exit(1);
  }

  if (posnMax == 0) {
    posnMax = 16384;
    try {
      posn    = new uint64 [posnMax];
    } catch (...) {
      fprintf(stderr, "positionDB::getUpToNMismatches()-- Can't allocate space for initial positions, requested "uint64FMT" uint64's.\n", posnMax);
      abort();
    }
  }

  uint64  orig = HASH(mer);

  //  Optimization that didn't work.  The idea was to compute all the
  //  hashes with errors, then sort to gain better cache locality in
  //  the lookups.  The sort dominated.
  //
  //  Another: Surprisingly, the two getDecodedValue calls are faster
  //  than a single getDecodedValues.

  for (uint32 e=0; e<_hashedErrorsLen; e++) {
    uint64 hash = orig ^ _hashedErrors[e];
    uint64 st, ed;

    if (_hashTable_BP) {
      st = getDecodedValue(_hashTable_BP, hash * _hashWidth,              _hashWidth);
      ed = getDecodedValue(_hashTable_BP, hash * _hashWidth + _hashWidth, _hashWidth);
    } else {
      st = _hashTable_FW[hash];
      ed = _hashTable_FW[hash+1];
    }

    assert((_hashedErrors[e] & ~_hashMask) == 0);
    assert((hash             & ~_hashMask) == 0);

    //  Rebuild the mer from the hash and its check code.
    //
    //  Compare the rebuilt mer and the original mer -- if there are
    //  exactly N errors, it's a hit!  (if there are fewer than N,
    //  we'll find it when we look for N-1 errors).
    //
    //  Before rebuilding, compute diffs on the chckBits only -- if
    //  things are wildly different (the usual case) we'll get
    //  enough difference here to abort.  Remember, the chck bits
    //  are not encoded, they're an exact copy from the unhashed
    //  mer.

    if (st != ed) {
      for (uint64 i=ed-st, J=st * _wFin; i--; J += _wFin) {
        uint64 chck  = getDecodedValue(_buckets, J, _chckWidth);
        uint64 diffs = chck ^ (mer & _mask2);
        uint64 d1    = diffs & uint64NUMBER(0x5555555555555555);
        uint64 d2    = diffs & uint64NUMBER(0xaaaaaaaaaaaaaaaa);
        uint64 err   = countNumberOfSetBits64(d1 | (d2 >> 1));

        if (err <= numMismatches) {
          diffs = REBUILD(hash, chck) ^ mer;
          d1    = diffs & uint64NUMBER(0x5555555555555555);
          d2    = diffs & uint64NUMBER(0xaaaaaaaaaaaaaaaa);
          err   = countNumberOfSetBits64(d1 | (d2 >> 1));

          if (err <= numMismatches)
            //  err is junk, just need a parameter here
            loadPositions(J, posn, posnMax, posnLen, err);
        }
      }
    }
  }

  return (posnLen > 0);  // assumption: the original listing is truncated here;
                         // reporting whether any hits were found matches the bool contract
}
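The mismatch count above relies on the 2-bit base encoding: d1 masks the low bit of every base pair, d2 the high bit, and OR-ing d1 with d2 shifted down leaves exactly one set bit per differing base, so a population count yields the number of mismatched bases. A standalone sketch, with a helper name of my own and the GCC/Clang popcount builtin standing in for countNumberOfSetBits64:

#include <stdint.h>

/* Count differing 2-bit bases between two packed mers (sketch). */
static inline int count_base_mismatches(uint64_t a, uint64_t b)
{
  uint64_t diffs = a ^ b;
  uint64_t d1 = diffs & UINT64_C(0x5555555555555555);  /* low bit per base  */
  uint64_t d2 = diffs & UINT64_C(0xaaaaaaaaaaaaaaaa);  /* high bit per base */
  return __builtin_popcountll(d1 | (d2 >> 1));         /* one bit per diff  */
}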
Example #8
void gravity_kernel2(int nj, pPrdPosVel posvel, pNewAccJrk accjerk)
{
  int ret;
  int j;
  double true_rmin2;
  pPred_Mem jptr = pred_mem;
  pIparticle iptr;
  float ten = 10.0, minusone = -1.0;

  ret = posix_memalign((void **)&iptr, 32, NVAR_IP * 32);
  assert(ret == 0);

  VBROADCASTSD(posvel[0].xpos, YMM00);
  VBROADCASTSD(posvel[0].ypos, YMM01);
  VBROADCASTSD(posvel[0].zpos, YMM02);

  VBROADCASTSD(posvel[1].xpos, YMM03);
  VBROADCASTSD(posvel[1].ypos, YMM04);
  VBROADCASTSD(posvel[1].zpos, YMM05);

  VBROADCASTSS(posvel[0].xvel, XMM06);
  VBROADCASTSS(posvel[1].xvel, XMM07);
  VMERGE(YMM06, YMM07, YMM06);

  VBROADCASTSS(posvel[0].yvel, XMM08);
  VBROADCASTSS(posvel[1].yvel, XMM09);
  VMERGE(YMM08, YMM09, YMM07);

  VBROADCASTSS(posvel[0].zvel, XMM10);
  VBROADCASTSS(posvel[1].zvel, XMM11);
  VMERGE(YMM10, YMM11, YMM08);

  VBROADCASTSS(posvel[0].id, XMM12);
  VBROADCASTSS(posvel[1].id, XMM13);
  VMERGE(YMM12, YMM13, YMM09);

  VBROADCASTSS(posvel[0].eps2, XMM14);
  VBROADCASTSS(posvel[1].eps2, XMM15);
  VMERGE(YMM14, YMM15, YMM10);

  VBROADCASTSS(ten, YMM11);
  VBROADCASTSS(minusone, YMM12);

  VSTORPD(YMM00, iptr->xpos0[0]);
  VSTORPD(YMM01, iptr->ypos0[0]);
  VSTORPD(YMM02, iptr->zpos0[0]);
  VSTORPD(YMM03, iptr->xpos1[0]);
  VSTORPD(YMM04, iptr->ypos1[0]);
  VSTORPD(YMM05, iptr->zpos1[0]);
  VSTORPS(YMM06, iptr->xvel01[0]);
  VSTORPS(YMM07, iptr->yvel01[0]);
  VSTORPS(YMM08, iptr->zvel01[0]);
  VSTORPS(YMM09, iptr->id01[0]);
  VSTORPS(YMM10, iptr->veps2[0]);
  VSTORPS(YMM11, iptr->rmin2[0]);
  VSTORPS(YMM12, iptr->in[0]);

  VZEROALL;
  for(j = 0; j < nj; j += JPARA, jptr++){ // if nj % 2 != 0 ATARU
    // dx -> YMM03
    VLOADPD(jptr->xpos[0], YMM00);
    VSUBPD_M(iptr->xpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->xpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM03);
    // dy -> YMM04
    VLOADPD(jptr->ypos[0], YMM00);
    VSUBPD_M(iptr->ypos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->ypos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM04);
    // dz -> YMM05
    VLOADPD(jptr->zpos[0], YMM00);
    VSUBPD_M(iptr->zpos0[0], YMM00, YMM01);
    VCVTPD2PS(YMM01, XMM01);
    VSUBPD_M(iptr->zpos1[0], YMM00, YMM02);
    VCVTPD2PS(YMM02, XMM02);
    VMERGE(YMM01, YMM02, YMM05);
    // dr^2
    VLOADPS(iptr->veps2[0], YMM01);
    VMULPS(YMM03, YMM03, YMM00);
    VADDPS(YMM00, YMM01, YMM01);
    VMULPS(YMM04, YMM04, YMM00);
    VADDPS(YMM00, YMM01, YMM01);
    VMULPS(YMM05, YMM05, YMM00);
    VADDPS(YMM00, YMM01, YMM01);
    // - 2 / r -> YMM01
    VRSQRTPS(YMM01, YMM02);
    VMULPS(YMM02, YMM01, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    VSUBPS_M(three[0], YMM01, YMM01);
    VMULPS(YMM02, YMM01, YMM01);
    // mask
    VLOADPS(jptr->indx[0], YMM02);
    VLOADPS(iptr->id01[0], YMM00);
    VCMPNEQPS(YMM00, YMM02, YMM02);
    VANDPS(YMM02, YMM01, YMM01);    
    // nearest neighbour (free: YMM00, YMM02, YMM06, YMM07, YMM08)
    VLOADPS(iptr->rmin2[0], YMM00);
    VMINPS(YMM01, YMM00, YMM02);
    VSTORPS(YMM02, iptr->rmin2[0]);
    VCMPPS(YMM01, YMM00, YMM02, GT);
    VLOADPS(jptr->indx[0], YMM06);
    VANDPS(YMM02, YMM06, YMM07);
    VCMPPS(YMM01, YMM00, YMM08, LE);
    VANDPS_M(iptr->in[0], YMM08, YMM08);
    VADDPS(YMM08, YMM07, YMM07);
    VSTORPS(YMM07, iptr->in[0]);
    // potential
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VCVTPS2PD(XMM02, YMM00);
    VUP2LOW(YMM02, XMM06);
    VCVTPS2PD(XMM06, YMM06);
    VHADDPD(YMM06, YMM00, YMM07);
    VADDPD(YMM07, YMM09, YMM09);
    // dvx, dvy, dvz (vj - vi)
    VLOADPS(jptr->xvel[0], YMM06);
    VSUBPS_M(iptr->xvel01[0], YMM06, YMM06);
    VLOADPS(jptr->yvel[0], YMM07);
    VSUBPS_M(iptr->yvel01[0], YMM07, YMM07);
    VLOADPS(jptr->zvel[0], YMM08);
    VSUBPS_M(iptr->zvel01[0], YMM08, YMM08);
    // xv -> YMM00
    VMULPS(YMM03, YMM06, YMM00);
    VMULPS(YMM04, YMM07, YMM02);
    VADDPS(YMM02, YMM00, YMM00);
    VMULPS(YMM05, YMM08, YMM02);
    VADDPS(YMM02, YMM00, YMM00);
    // YMM00: 3.0 * xv / r^2, YMM02: - m / r^3
    VMULPS_M(jptr->mass[0], YMM01, YMM02);
    VMULPS(YMM01, YMM01, YMM01);
    VMULPS(YMM01, YMM00, YMM00);
    VMULPS(YMM01, YMM02, YMM02);
    VMULPS_M(threefourth[0], YMM00, YMM00);
    // prefetch
    PREFETCH((jptr+1)->xpos[0]);
    PREFETCH((jptr+1)->zpos[0]);
    PREFETCH((jptr+1)->mass[0]);
    PREFETCH((jptr+1)->yvel[0]);
    // jx1
    VMULPS(YMM02, YMM06, YMM06);
    VADDPS(YMM06, YMM13, YMM13);
    // jy1
    VMULPS(YMM02, YMM07, YMM07);
    VADDPS(YMM07, YMM14, YMM14);
    // jz1
    VMULPS(YMM02, YMM08, YMM08);
    VADDPS(YMM08, YMM15, YMM15);
    // ax
    VMULPS(YMM02, YMM03, YMM03);
    VCVTPS2PD(XMM03, YMM06);
    VUP2LOW(YMM03, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM10, YMM10);
    // ay
    VMULPS(YMM02, YMM04, YMM04);
    VCVTPS2PD(XMM04, YMM06);
    VUP2LOW(YMM04, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM11, YMM11);
    // az
    VMULPS(YMM02, YMM05, YMM05);
    VCVTPS2PD(XMM05, YMM06);
    VUP2LOW(YMM05, XMM07);
    VCVTPS2PD(XMM07, YMM07);
    VHADDPD(YMM07, YMM06, YMM06);
    VADDPD(YMM06, YMM12, YMM12);
    // jx2
    VMULPS(YMM00, YMM03, YMM03);
    VSUBPS(YMM03, YMM13, YMM13);
    // jy2
    VMULPS(YMM00, YMM04, YMM04);
    VSUBPS(YMM04, YMM14, YMM14);
    // jz2
    VMULPS(YMM00, YMM05, YMM05);
    VSUBPS(YMM05, YMM15, YMM15);
  }

  VSTORPD(YMM09, iptr->pot[0]);
  VSTORPD(YMM10, iptr->xacc[0]);
  VSTORPD(YMM11, iptr->yacc[0]);
  VSTORPD(YMM12, iptr->zacc[0]);
  VSTORPS(YMM13, iptr->xjrk[0]);
  VSTORPS(YMM14, iptr->yjrk[0]);
  VSTORPS(YMM15, iptr->zjrk[0]);

  accjerk[0].xacc = iptr->xacc[0] + iptr->xacc[2];
  accjerk[0].yacc = iptr->yacc[0] + iptr->yacc[2];
  accjerk[0].zacc = iptr->zacc[0] + iptr->zacc[2];
  accjerk[0].pot  = iptr->pot[0]  + iptr->pot[2];
  accjerk[0].xjrk = iptr->xjrk[0] + iptr->xjrk[1] + iptr->xjrk[2] + iptr->xjrk[3];
  accjerk[0].yjrk = iptr->yjrk[0] + iptr->yjrk[1] + iptr->yjrk[2] + iptr->yjrk[3];
  accjerk[0].zjrk = iptr->zjrk[0] + iptr->zjrk[1] + iptr->zjrk[2] + iptr->zjrk[3];
  for(true_rmin2 = 1e30, j = 0; j < JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2    = iptr->rmin2[j];
      accjerk[0].rnnb = - 2.0 / true_rmin2;
      accjerk[0].nnb  = (int)iptr->in[j];
    }
  }

  accjerk[1].xacc = iptr->xacc[1] + iptr->xacc[3];
  accjerk[1].yacc = iptr->yacc[1] + iptr->yacc[3];
  accjerk[1].zacc = iptr->zacc[1] + iptr->zacc[3];
  accjerk[1].pot  = iptr->pot[1]  + iptr->pot[3];
  accjerk[1].xjrk = iptr->xjrk[4] + iptr->xjrk[5] + iptr->xjrk[6] + iptr->xjrk[7];
  accjerk[1].yjrk = iptr->yjrk[4] + iptr->yjrk[5] + iptr->yjrk[6] + iptr->yjrk[7];
  accjerk[1].zjrk = iptr->zjrk[4] + iptr->zjrk[5] + iptr->zjrk[6] + iptr->zjrk[7];
  for(true_rmin2 = 1e30, j = 4; j < 4 + JPARA; j++){
    if(iptr->rmin2[j] < true_rmin2){
      true_rmin2    = iptr->rmin2[j];
      accjerk[1].rnnb = - 2.0 / true_rmin2;
      accjerk[1].nnb = (int)iptr->in[j];
    }
  }

  free(iptr);

  return;
}
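The "nearest neighbour" block keeps a per-lane running minimum and the index that produced it entirely branch-free, using a min, a compare mask, and masked merges. A sketch of the same idea in AVX intrinsics; I use _mm256_blendv_ps for the merge where the kernel uses its AND/ADD mask macros, and the names are mine:

#include <immintrin.h>

/* Track the per-lane minimum distance and the matching index (sketch). */
static inline void track_min(__m256 d, __m256 j_idx, __m256 *rmin, __m256 *idx)
{
  __m256 lt = _mm256_cmp_ps(d, *rmin, _CMP_LT_OQ);   /* lanes where d wins  */
  *idx  = _mm256_blendv_ps(*idx, j_idx, lt);         /* keep winning index  */
  *rmin = _mm256_min_ps(d, *rmin);                   /* keep winning value  */
}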
Example #9
static void collector_trace_rootsets(Collector* collector)
{
  GC* gc = collector->gc;
  GC_Metadata* metadata = gc->metadata;
#ifdef GC_GEN_STATS
  GC_Gen_Collector_Stats* stats = (GC_Gen_Collector_Stats*)collector->stats;
#endif
  
  unsigned int num_active_collectors = gc->num_active_collectors;
  atomic_cas32( &num_finished_collectors, 0, num_active_collectors);

  Space* space = collector->collect_space;
  collector->trace_stack = free_task_pool_get_entry(metadata);

  /* find root slots saved by 1. active mutators, 2. exited mutators, 3. last cycle collectors */  
  Vector_Block* root_set = pool_iterator_next(metadata->gc_rootset_pool);

  /* first step: copy all root objects to trace tasks. */ 

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: copy root objects to trace stack ...");
  while(root_set){
    POINTER_SIZE_INT* iter = vector_block_iterator_init(root_set);
    while(!vector_block_iterator_end(root_set,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(root_set, iter);

      assert(*p_ref);  /* root ref can't be NULL, but remset can be */

      collector_tracestack_push(collector, p_ref);

#ifdef GC_GEN_STATS    
      gc_gen_collector_update_rootset_ref_num(stats);
#endif
    } 
    root_set = pool_iterator_next(metadata->gc_rootset_pool);
  }
  /* put back the last trace_stack task */    
  pool_put_entry(metadata->mark_task_pool, collector->trace_stack);
  
  /* second step: iterate over the trace tasks and forward objects */
  collector->trace_stack = free_task_pool_get_entry(metadata);

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish copying root objects to trace stack.");

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: trace and forward objects ...");

retry:
  Vector_Block* trace_task = pool_get_entry(metadata->mark_task_pool);

  while(trace_task){    
    POINTER_SIZE_INT* iter = vector_block_iterator_init(trace_task);
    while(!vector_block_iterator_end(trace_task,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(trace_task, iter);
#ifdef PREFETCH_SUPPORTED
      /* DO PREFETCH */
      if( mark_prefetch ) {    
        if(!vector_block_iterator_end(trace_task, iter)) {
      	  REF *pref= (REF*) *iter;
      	  PREFETCH( read_slot(pref));
        }	
      }
#endif      
      trace_object(collector, p_ref);
      
      if(collector->result == FALSE)  break; /* force return */
 
    }
    vector_stack_clear(trace_task);
    pool_put_entry(metadata->free_task_pool, trace_task);

    if(collector->result == FALSE){
      gc_task_pool_clear(metadata->mark_task_pool);
      break; /* force return */
    }
    
    trace_task = pool_get_entry(metadata->mark_task_pool);
  }
  
  /* A collector comes here when seeing an empty mark_task_pool. The last collector will ensure 
     all the tasks are finished.*/
     
  atomic_inc32(&num_finished_collectors);
  while(num_finished_collectors != num_active_collectors){
    if( pool_is_empty(metadata->mark_task_pool)) continue;
    /* we can't grab the task here, because of a race condition. If we grab the task, 
       and the pool is empty, other threads may fall to this barrier and then pass. */
    atomic_dec32(&num_finished_collectors);
    goto retry; 
  }

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish tracing and forwarding objects.");

  /* now we are done, but each collector has a private stack that is empty */  
  trace_task = (Vector_Block*)collector->trace_stack;
  vector_stack_clear(trace_task);
  pool_put_entry(metadata->free_task_pool, trace_task);   
  collector->trace_stack = NULL;
  
  return;
}
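The retry loop at the end is a termination barrier: a collector that finds the task pool empty checks in, but if work reappears before every collector has checked in, it checks back out and goes to help. A minimal sketch of that protocol with C11 atomics and hypothetical names:

#include <stdatomic.h>

/* Termination barrier: all workers must agree the pool stayed empty (sketch). */
static void drain_until_done(atomic_uint *finished, unsigned total,
                             int (*pool_empty)(void), void (*drain_pool)(void))
{
retry:
  drain_pool();                     /* process tasks until the pool is empty */
  atomic_fetch_add(finished, 1);    /* check in                              */
  while (atomic_load(finished) != total) {
    if (pool_empty())
      continue;                     /* still waiting for the others          */
    atomic_fetch_sub(finished, 1);  /* new work appeared: check out          */
    goto retry;
  }
}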
Example #10
static void collector_trace_rootsets(Collector* collector)
{
  GC* gc = collector->gc;
  GC_Metadata* metadata = gc->metadata;
#ifdef GC_GEN_STATS
  GC_Gen_Collector_Stats* stats = (GC_Gen_Collector_Stats*)collector->stats;
#endif
  
  unsigned int num_active_collectors = gc->num_active_collectors;
  atomic_cas32( &num_finished_collectors, 0, num_active_collectors);

  Space* space = collector->collect_space;
  collector->trace_stack = free_task_pool_get_entry(metadata);

  /* find root slots saved by 1. active mutators, 2. exited mutators, 3. last cycle collectors */  
  Vector_Block* root_set = pool_iterator_next(metadata->gc_rootset_pool);

  /* first step: copy all root objects to trace tasks. */ 

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: copy root objects to trace stack ......");
  while(root_set){
    POINTER_SIZE_INT* iter = vector_block_iterator_init(root_set);
    while(!vector_block_iterator_end(root_set,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(root_set,iter);
      
      if(!*p_ref) continue;  /* root ref can't be NULL, but remset can be */
      Partial_Reveal_Object *p_obj = read_slot(p_ref);

#ifdef GC_GEN_STATS
      gc_gen_collector_update_rootset_ref_num(stats);
#endif

      if(obj_belongs_to_nos(p_obj)){
        collector_tracestack_push(collector, p_ref);
      }
    } 
    root_set = pool_iterator_next(metadata->gc_rootset_pool);
  }
  /* put back the last trace_stack task */    
  pool_put_entry(metadata->mark_task_pool, collector->trace_stack);
  
  /* second step: iterate over the trace tasks and forward objects */
  collector->trace_stack = free_task_pool_get_entry(metadata);

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish copying root objects to trace stack.");

  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: trace and forward objects ......");

retry:
  Vector_Block* trace_task = pool_get_entry(metadata->mark_task_pool);

  while(trace_task){    
    POINTER_SIZE_INT* iter = vector_block_iterator_init(trace_task);
    while(!vector_block_iterator_end(trace_task,iter)){
      REF *p_ref = (REF *)*iter;
      iter = vector_block_iterator_advance(trace_task,iter);
      assert(*p_ref); /* a task can't be NULL, it was checked before put into the task stack */
#ifdef PREFETCH_SUPPORTED      
      /* DO PREFETCH */  
      if( mark_prefetch ) {    
        if(!vector_block_iterator_end(trace_task, iter)) {
      	  REF *pref= (REF*) *iter;
      	  PREFETCH( read_slot(pref));
        }	
      }
#endif            
      /* In the sequential version we only trace the same object once, but we were using a local
         hashset for that, which couldn't catch repetition across multiple collectors. This is
         subject to more study. */

      /* FIXME:: we should not let root_set go empty while working; others may want to steal it.
         Degenerate my stack into root_set, and grab another stack. */
   
      /* a task has to belong to collected space, it was checked before put into the stack */
      trace_object(collector, p_ref);
      if(collector->result == FALSE)  break; /* force return */
    }
    vector_stack_clear(trace_task);
    pool_put_entry(metadata->free_task_pool, trace_task);
    if(collector->result == FALSE){
      gc_task_pool_clear(metadata->mark_task_pool);
      break; /* force return */
    }

    trace_task = pool_get_entry(metadata->mark_task_pool);
  }
  
  atomic_inc32(&num_finished_collectors);
  while(num_finished_collectors != num_active_collectors){
    if( pool_is_empty(metadata->mark_task_pool)) continue;
    /* we can't grab the task here, because of a race condition. If we grab the task, 
       and the pool is empty, other threads may fall to this barrier and then pass. */
    atomic_dec32(&num_finished_collectors);
    goto retry;      
  }
  TRACE2("gc.process", "GC: collector["<<((POINTER_SIZE_INT)collector->thread_handle)<<"]: finish tracing and forwarding objects.");

  /* now we are done, but each collector has a private stack that is empty */  
  trace_task = (Vector_Block*)collector->trace_stack;
  vector_stack_clear(trace_task);
  pool_put_entry(metadata->free_task_pool, trace_task);   
  collector->trace_stack = NULL;
  
  return;
}
Example #11
void GravityKernel0(pIpdata ipdata, pFodata fodata, pJpdata0 jpdata, int nj)
{
  int j;

  PREFETCH(jpdata[0]);

  VZEROALL;
  VLOADPS(*ipdata->x, XMM04);
  VLOADPS(*ipdata->y, XMM05);
  VLOADPS(*ipdata->z, XMM06);
  VLOADPS(*ipdata->eps2, XMM15);
  VPERM2F128(XI, XI, XI, 0x00);
  VPERM2F128(YI, YI, YI, 0x00);
  VPERM2F128(ZI, ZI, ZI, 0x00);
  VPERM2F128(EPSI2, EPSI2, EPSI2, 0x00);

  VLOADPS(jpdata->xm[0][0], Z2);
  VADDPS_M(jpdata->ep[0][0], EPSI2, EPSJ2);
  jpdata++;

  VSHUFPS(Z2, Z2, X2, 0x00);
  VSHUFPS(Z2, Z2, MJ, 0xff);
  VSHUFPS(Z2, Z2, Y2, 0x55);
  VSHUFPS(Z2, Z2, Z2, 0xaa);

  for(j = 0; j < nj; j += 2){

    VSUBPS(XI, X2, DX);
    VSUBPS(ZI, Z2, DZ);
    VSUBPS(YI, Y2, DY);

    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, Z2);
    VMULPS(DY, DY, Y2);

    VADDPS(X2, Z2, X2);
    VADDPS(EPSJ2, Y2, Y2);
    VADDPS(X2, Y2, Y2);

    VLOADPS(jpdata->xm[0][0], Z2);
    VADDPS_M(jpdata->ep[0][0], EPSI2, EPSJ2);
    jpdata++;

    VRSQRTPS(Y2, X2);

    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);

    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);

    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);

    VSHUFPS(Z2, Z2, X2, 0x00);
    VSHUFPS(Z2, Z2, MJ, 0xff);
    VSHUFPS(Z2, Z2, Y2, 0x55);
    VSHUFPS(Z2, Z2, Z2, 0xaa);

    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);

  }

  VEXTRACTF128(AX, XMM00, 0x01);
  VADDPS(AX, YMM00, AX);
  VEXTRACTF128(AY, XMM01, 0x01);
  VADDPS(AY, YMM01, AY);
  VEXTRACTF128(AZ, XMM02, 0x01);
  VADDPS(AZ, YMM02, AZ);
  VEXTRACTF128(PHI, XMM03, 0x01);
  VADDPS(PHI, YMM03, PHI);

  VSTORPS(XMM08,  *fodata->ax);
  VSTORPS(XMM09,  *fodata->ay);
  VSTORPS(XMM10,  *fodata->az);
  VSTORPS(XMM11, *fodata->phi);

}
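GravityKernel0 loads two j-particles as one interleaved vector and then uses VSHUFPS with the immediates 0x00/0x55/0xaa/0xff to replicate each field across its 128-bit lane, avoiding four separate broadcast loads. A sketch of the trick in AVX intrinsics (the names and the {x,y,z,m} field order below are illustrative; the kernel's actual layout is whatever jpdata->xm holds):

#include <immintrin.h>

/* Broadcast the four fields of two interleaved {x,y,z,m} particles (sketch). */
static inline void broadcast_xyzm(__m256 xm, __m256 *x, __m256 *y,
                                  __m256 *z, __m256 *m)
{
  *x = _mm256_shuffle_ps(xm, xm, 0x00);  /* element 0 of each 128-bit lane */
  *y = _mm256_shuffle_ps(xm, xm, 0x55);  /* element 1 */
  *z = _mm256_shuffle_ps(xm, xm, 0xaa);  /* element 2 */
  *m = _mm256_shuffle_ps(xm, xm, 0xff);  /* element 3 */
}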
Example #12
void GravityKernel(pIpdata ipdata, pFodata fodata, pJpdata jpdata, int nj)
{
  int j;

  PREFETCH(jpdata[0]);

  VZEROALL;
  VLOADPS(*ipdata->x, XMM04);
  VLOADPS(*ipdata->y, XMM05);
  VLOADPS(*ipdata->z, XMM06);
  VLOADPS(*ipdata->eps2, XMM15);
  VPERM2F128(XI, XI, XI, 0x00);
  VPERM2F128(YI, YI, YI, 0x00);
  VPERM2F128(ZI, ZI, ZI, 0x00);
  VPERM2F128(EPS2, EPS2, EPS2, 0x00);

#if (2 == NUNROLL)
  VLOADPS(*(jpdata), J1);
  jpdata += 2;

  VSHUFPS(J1, J1, X2, 0x00);
  VSHUFPS(J1, J1, J2, 0xaa);
  VSHUFPS(J1, J1, MJ, 0xff);
  VSHUFPS(J1, J1, Y2, 0x55);

  for(j = 0; j < nj; j += 2){

    VSUBPS(XI, X2, DX);
    VSUBPS(ZI, J2, DZ);
    VSUBPS(YI, Y2, DY);

    VLOADPS(*(jpdata), J1);
    jpdata += 2;
    
    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, J2);
    VMULPS(DY, DY, Y2);

    VADDPS(X2, J2, J2);
    VADDPS(EPS2, Y2, Y2);
    VADDPS(J2, Y2, Y2);

    VRSQRTPS(Y2, X2);

    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);

    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);

    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);

    VSHUFPS(J1, J1, X2, 0x00);
    VSHUFPS(J1, J1, J2, 0xaa);
    VSHUFPS(J1, J1, MJ, 0xff);
    VSHUFPS(J1, J1, Y2, 0x55);

    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }
#elif (4 == NUNROLL)
#if 1
  VLOADPS(*(jpdata), J1);
  VLOADPS(*(jpdata+2), J2);

  jpdata += 4;

  VSHUFPS(J1, J1, X2, 0x00);
  VSHUFPS(J1, J1, Y2, 0x55);
  VSHUFPS(J1, J1, MJ, 0xff);
  VSHUFPS(J1, J1, J1, 0xaa);

  for(j = 0 ; j < nj; j += 4) {

    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, J1, DZ);

    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, J1);
    VMULPS(DY, DY, Y2);

    VADDPS(J1, X2, X2);
    VADDPS(EPS2, Y2, Y2);
    VADDPS(Y2, X2, Y2);

    VLOADPS(*(jpdata), J1);

    VRSQRTPS(Y2, X2);

    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);

    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);

    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);

    VSHUFPS(J2, J2, X2, 0x00);
    VSHUFPS(J2, J2, MJ, 0xff);
    VSHUFPS(J2, J2, Y2, 0x55);
    VSHUFPS(J2, J2, J2, 0xaa);

    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);

    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, J2, DZ);

    VMULPS(DX, DX, X2);
    VMULPS(DZ, DZ, J2);
    VMULPS(DY, DY, Y2);

    VADDPS(J2, X2, X2);
    VADDPS(EPS2, Y2, Y2);
    VADDPS(Y2, X2, Y2);

    VLOADPS(*(jpdata+2), J2);    

    VRSQRTPS(Y2, X2);

    VMULPS(X2, MJ, MJ);
    VMULPS(X2, X2, Y2);

    jpdata += 4;
    PREFETCH(*(jpdata));

    VMULPS(MJ, Y2, Y2);
    VSUBPS(MJ, PHI, PHI);

    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);

    VSHUFPS(J1, J1, X2, 0x00);
    VSHUFPS(J1, J1, MJ, 0xff);
    VSHUFPS(J1, J1, Y2, 0x55);
    VSHUFPS(J1, J1, J1, 0xaa);

    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }
#else
  VLOADPS(*(jpdata), J1);
  VLOADPS(*(jpdata+2), J2);

  jpdata += 4;

  VSHUFPS(J1, J1, X2, 0x00);
  VSHUFPS(J1, J1, Y2, 0x55);
  VSHUFPS(J1, J1, MJ, 0xaa);
  VSHUFPS(J1, J1, J1, 0xff);

  for(j = 0 ; j < nj; j += 4) {

    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, MJ, DZ);

    VMULPS(DX, DX, X2);
    VMULPS(DY, DY, Y2);
    VMULPS(DZ, DZ, MJ);

    VADDPS(X2, Y2, Y2);
    VADDPS(EPS2, MJ, MJ);
    VADDPS(Y2, MJ, Y2);

    VRSQRTPS(Y2, X2);

    VMULPS(X2, J1, Y2);
    VMULPS(X2, X2, X2);

    VLOADPS(*(jpdata), J1);

    VSUBPS(Y2, PHI, PHI);
    VMULPS(X2, Y2, Y2);

    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);

    VSHUFPS(J2, J2, X2, 0x00);
    VSHUFPS(J2, J2, Y2, 0x55);
    VSHUFPS(J2, J2, MJ, 0xaa);
    VSHUFPS(J2, J2, J2, 0xff);

    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);

    VSUBPS(XI, X2, DX);
    VSUBPS(YI, Y2, DY);
    VSUBPS(ZI, MJ, DZ);

    VMULPS(DX, DX, X2);
    VMULPS(DY, DY, Y2);
    VMULPS(DZ, DZ, MJ);

    VADDPS(X2, Y2, Y2);
    VADDPS(EPS2, MJ, MJ);
    VADDPS(Y2, MJ, Y2);

    VRSQRTPS(Y2, X2);

    VMULPS(X2, J2, Y2);
    VMULPS(X2, X2, X2);

    VLOADPS(*(jpdata+2), J2);    

    jpdata += 4;
    PREFETCH(*(jpdata));

    VSUBPS(Y2, PHI, PHI);
    VMULPS(X2, Y2, Y2);

    VMULPS(Y2, DX, DX);
    VMULPS(Y2, DY, DY);
    VMULPS(Y2, DZ, DZ);

    VSHUFPS(J1, J1, X2, 0x00);
    VSHUFPS(J1, J1, Y2, 0x55);
    VSHUFPS(J1, J1, MJ, 0xaa);
    VSHUFPS(J1, J1, J1, 0xff);

    VADDPS(DX, AX, AX);
    VADDPS(DY, AY, AY);
    VADDPS(DZ, AZ, AZ);
  }
#endif
#else
#error
#endif

  VEXTRACTF128(AX, XMM00, 0x01);
  VADDPS(AX, YMM00, AX);
  VEXTRACTF128(AY, XMM01, 0x01);
  VADDPS(AY, YMM01, AY);
  VEXTRACTF128(AZ, XMM02, 0x01);
  VADDPS(AZ, YMM02, AZ);
  VEXTRACTF128(PHI, XMM03, 0x01);
  VADDPS(PHI, YMM03, PHI);

  VSTORPS(XMM08,  *fodata->ax);
  VSTORPS(XMM09,  *fodata->ay);
  VSTORPS(XMM10,  *fodata->az);
  VSTORPS(XMM11, *fodata->phi);

}
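The NUNROLL == 4 path is software-pipelined: the VLOADPS for the next pair of j-particles is issued in the middle of the current iteration's arithmetic, and the PREFETCH after the pointer bump requests the block after that, so loads overlap computation. The scalar shape of the idea, with hypothetical names (the input is assumed padded by one element so the early load never reads past the end):

/* Software-pipelined reduction: issue the next load before finishing the
 * current iteration's work (sketch; a[] must be padded by one element). */
static float pipelined_sum_of_squares(const float *a, int n)
{
  float cur = a[0], acc = 0.0f;
  for (int i = 0; i < n; i++) {
    float next = a[i + 1];  /* load early; overlaps the multiply below */
    acc += cur * cur;
    cur = next;
  }
  return acc;
}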