RETf RCPSQRT( const __m128 x ) { return _mm_rsqrt_ps(x); }
inline __m128 SSENormalizeMultiplierSSE4(__m128 v) { return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF)); } // 0xFF: dot product over all four lanes, result broadcast to all lanes (requires SSE4.1)
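/* A sketch of the same normalization multiplier without SSE4.1's _mm_dp_ps:
   the horizontal sum of the squared components is built from two shuffle+add
   steps, so only SSE1 is required. Not from the original source. */
#include <xmmintrin.h>
static inline __m128 SSENormalizeMultiplierSSE1(__m128 v)
{
    __m128 sq = _mm_mul_ps(v, v); /* x*x, y*y, z*z, w*w */
    __m128 t  = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 3, 0, 1)));
    t         = _mm_add_ps(t,  _mm_shuffle_ps(t,  t,  _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_rsqrt_ps(t);      /* ~12-bit 1/|v| broadcast to all lanes */
}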
void PreviewWorker::processCoherent(const WorkUnit *workUnit,
        WorkResult *workResult, const bool &stop) {
#if defined(MTS_HAS_COHERENT_RT)
    const RectangularWorkUnit *rect = static_cast<const RectangularWorkUnit *>(workUnit);
    ImageBlock *block = static_cast<ImageBlock *>(workResult);

    block->setOffset(rect->getOffset());
    block->setSize(rect->getSize());

    /* Some constants */
    const int sx = rect->getOffset().x, sy = block->getOffset().y;
    const int ex = sx + rect->getSize().x, ey = sy + rect->getSize().y;
    const int width = rect->getSize().x;
    const SSEVector MM_ALIGN16 xOffset(0.0f, 1.0f, 0.0f, 1.0f);
    const SSEVector MM_ALIGN16 yOffset(0.0f, 0.0f, 1.0f, 1.0f);
    const int pixelOffset[] = {0, 1, width, width+1};
    const __m128 clamping = _mm_set1_ps(1/(m_minDist*m_minDist));
    uint8_t temp[MTS_KD_INTERSECTION_TEMP*4];

    const __m128 camTL[3] = {
        _mm_set1_ps(m_cameraTL.x),
        _mm_set1_ps(m_cameraTL.y),
        _mm_set1_ps(m_cameraTL.z) };
    const __m128 camDx[3] = {
        _mm_set1_ps(m_cameraDx.x),
        _mm_set1_ps(m_cameraDx.y),
        _mm_set1_ps(m_cameraDx.z) };
    const __m128 camDy[3] = {
        _mm_set1_ps(m_cameraDy.x),
        _mm_set1_ps(m_cameraDy.y),
        _mm_set1_ps(m_cameraDy.z) };
    const __m128 lumPos[3] = {
        _mm_set1_ps(m_vpl.its.p.x),
        _mm_set1_ps(m_vpl.its.p.y),
        _mm_set1_ps(m_vpl.its.p.z) };
    const __m128 lumDir[3] = {
        _mm_set1_ps(m_vpl.its.shFrame.n.x),
        _mm_set1_ps(m_vpl.its.shFrame.n.y),
        _mm_set1_ps(m_vpl.its.shFrame.n.z) };

    /* Some local variables */
    int pos = 0;
    int numRays = 0;
    RayPacket4 MM_ALIGN16 primRay4, secRay4;
    Intersection4 MM_ALIGN16 its4, secIts4;
    RayInterval4 MM_ALIGN16 itv4, secItv4;
    SSEVector MM_ALIGN16 nSecD[3], cosThetaLight, invLengthSquared;
    Spectrum emitted[4], direct[4];
    Intersection its;
    Vector wo, wi;
    its.hasUVPartials = false;

    bool diffuseVPL = false, vplOnSurface = false;
    Spectrum vplWeight;

    if (m_vpl.type == ESurfaceVPL && (m_diffuseSources ||
            m_vpl.its.shape->getBSDF()->getType() == BSDF::EDiffuseReflection)) {
        diffuseVPL = true;
        vplOnSurface = true;
        vplWeight = m_vpl.its.shape->getBSDF()->getDiffuseReflectance(m_vpl.its) * m_vpl.P / M_PI;
    } else if (m_vpl.type == ELuminaireVPL) {
        vplOnSurface = m_vpl.luminaire->getType() & Luminaire::EOnSurface;
        diffuseVPL = m_vpl.luminaire->getType() & Luminaire::EDiffuseDirection;
        EmissionRecord eRec(m_vpl.luminaire,
            ShapeSamplingRecord(m_vpl.its.p, m_vpl.its.shFrame.n), m_vpl.its.shFrame.n);
        vplWeight = m_vpl.P * m_vpl.luminaire->evalDirection(eRec);
    }

    primRay4.o[0].ps = _mm_set1_ps(m_cameraO.x);
    primRay4.o[1].ps = _mm_set1_ps(m_cameraO.y);
    primRay4.o[2].ps = _mm_set1_ps(m_cameraO.z);
    secItv4.mint.ps = _mm_set1_ps(ShadowEpsilon);

    /* Work on 2x2 sub-blocks */
    for (int y=sy; y<ey; y += 2, pos += width) {
        for (int x=sx; x<ex; x += 2, pos += 2) {
            /* Generate camera rays without normalization */
            const __m128
                xPixel = _mm_add_ps(xOffset.ps, _mm_set1_ps((float) x)),
                yPixel = _mm_add_ps(yOffset.ps, _mm_set1_ps((float) y));

            primRay4.d[0].ps = _mm_add_ps(camTL[0], _mm_add_ps(
                _mm_mul_ps(xPixel, camDx[0]), _mm_mul_ps(yPixel, camDy[0])));
            primRay4.d[1].ps = _mm_add_ps(camTL[1], _mm_add_ps(
                _mm_mul_ps(xPixel, camDx[1]), _mm_mul_ps(yPixel, camDy[1])));
            primRay4.d[2].ps = _mm_add_ps(camTL[2], _mm_add_ps(
                _mm_mul_ps(xPixel, camDx[2]), _mm_mul_ps(yPixel, camDy[2])));
            primRay4.dRcp[0].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[0].ps);
            primRay4.dRcp[1].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[1].ps);
            primRay4.dRcp[2].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[2].ps);

            /* Ray coherence test */
            const int primSignsX = _mm_movemask_ps(primRay4.d[0].ps);
            const int primSignsY = _mm_movemask_ps(primRay4.d[1].ps);
            const int primSignsZ = _mm_movemask_ps(primRay4.d[2].ps);
            const bool primCoherent =
                   (primSignsX == 0 || primSignsX == 0xF)
                && (primSignsY == 0 || primSignsY == 0xF)
                && (primSignsZ == 0 || primSignsZ == 0xF);

            /* Trace the primary rays */
            its4.t = SSEConstants::p_inf;
            if (EXPECT_TAKEN(primCoherent)) {
                primRay4.signs[0][0] = primSignsX ? 1 : 0;
                primRay4.signs[1][0] = primSignsY ? 1 : 0;
                primRay4.signs[2][0] = primSignsZ ? 1 : 0;
                m_kdtree->rayIntersectPacket(primRay4, itv4, its4, temp);
            } else {
                m_kdtree->rayIntersectPacketIncoherent(primRay4, itv4, its4, temp);
            }
            numRays += 4;

            /* Generate secondary rays */
            secRay4.o[0].ps = _mm_add_ps(primRay4.o[0].ps, _mm_mul_ps(its4.t.ps, primRay4.d[0].ps));
            secRay4.o[1].ps = _mm_add_ps(primRay4.o[1].ps, _mm_mul_ps(its4.t.ps, primRay4.d[1].ps));
            secRay4.o[2].ps = _mm_add_ps(primRay4.o[2].ps, _mm_mul_ps(its4.t.ps, primRay4.d[2].ps));
            secRay4.d[0].ps = _mm_sub_ps(lumPos[0], secRay4.o[0].ps);
            secRay4.d[1].ps = _mm_sub_ps(lumPos[1], secRay4.o[1].ps);
            secRay4.d[2].ps = _mm_sub_ps(lumPos[2], secRay4.o[2].ps);

            /* Normalization */
            const __m128 lengthSquared = _mm_add_ps(_mm_add_ps(
                    _mm_mul_ps(secRay4.d[0].ps, secRay4.d[0].ps),
                    _mm_mul_ps(secRay4.d[1].ps, secRay4.d[1].ps)),
                    _mm_mul_ps(secRay4.d[2].ps, secRay4.d[2].ps)),
                invLength = _mm_rsqrt_ps(lengthSquared);

            invLengthSquared.ps = _mm_min_ps(_mm_rcp_ps(lengthSquared), clamping);
            nSecD[0].ps = _mm_mul_ps(secRay4.d[0].ps, invLength);
            nSecD[1].ps = _mm_mul_ps(secRay4.d[1].ps, invLength);
            nSecD[2].ps = _mm_mul_ps(secRay4.d[2].ps, invLength);

            secRay4.dRcp[0].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[0].ps);
            secRay4.dRcp[1].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[1].ps);
            secRay4.dRcp[2].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[2].ps);

            cosThetaLight.ps = _mm_sub_ps(_mm_setzero_ps(),
                _mm_add_ps(_mm_add_ps(
                    _mm_mul_ps(nSecD[0].ps, lumDir[0]),
                    _mm_mul_ps(nSecD[1].ps, lumDir[1])),
                    _mm_mul_ps(nSecD[2].ps, lumDir[2])));
            secItv4.maxt.ps = _mm_set1_ps(1-ShadowEpsilon);

            /* Shading (scalar) --- this is way too much work and should
               be rewritten to be smarter in special cases */
            for (int idx=0; idx<4; ++idx) {
                if (EXPECT_NOT_TAKEN(its4.t.f[idx] == std::numeric_limits<float>::infinity())) {
                    /* Don't trace a secondary ray */
                    secItv4.maxt.f[idx] = 0;
                    emitted[idx] = m_scene->LeBackground(Ray(
                        Point(primRay4.o[0].f[idx], primRay4.o[1].f[idx], primRay4.o[2].f[idx]),
                        Vector(primRay4.d[0].f[idx], primRay4.d[1].f[idx], primRay4.d[2].f[idx]),
                        0.0f
                    )) * m_backgroundScale;
                    memset(&direct[idx], 0, sizeof(Spectrum));
                    continue;
                }

                const unsigned int primIndex = its4.primIndex.i[idx];
                const Shape *shape = (*m_shapes)[its4.shapeIndex.i[idx]];
                const BSDF *bsdf = shape->getBSDF();

                if (EXPECT_NOT_TAKEN(!bsdf)) {
                    memset(&emitted[idx], 0, sizeof(Spectrum));
                    memset(&direct[idx], 0, sizeof(Spectrum));
                    continue;
                }

                if (EXPECT_TAKEN(primIndex != KNoTriangleFlag)) {
                    const TriMesh *mesh = static_cast<const TriMesh *>(shape);
                    const Triangle &t = mesh->getTriangles()[primIndex];
                    const Normal *normals = mesh->getVertexNormals();
                    const Point2 *texcoords = mesh->getVertexTexcoords();
                    const Spectrum *colors = mesh->getVertexColors();
                    const TangentSpace *tangents = mesh->getVertexTangents();
                    const Float beta = its4.u.f[idx], gamma = its4.v.f[idx],
                        alpha = 1.0f - beta - gamma;
                    const uint32_t idx0 = t.idx[0], idx1 = t.idx[1], idx2 = t.idx[2];

                    if (EXPECT_TAKEN(normals)) {
                        const Normal &n0 = normals[idx0], &n1 = normals[idx1], &n2 = normals[idx2];
                        its.shFrame.n = normalize(n0 * alpha + n1 * beta + n2 * gamma);
                    } else {
                        const Point *positions = mesh->getVertexPositions();
                        const Point &p0 = positions[idx0], &p1 = positions[idx1], &p2 = positions[idx2];
                        Vector sideA = p1 - p0, sideB = p2 - p0;
                        Vector n = cross(sideA, sideB);
                        Float nLengthSqr = n.lengthSquared();
                        if (nLengthSqr != 0)
                            n /= std::sqrt(nLengthSqr);
                        its.shFrame.n = Normal(n);
                    }

                    if (EXPECT_TAKEN(texcoords)) {
                        const Point2 &t0 = texcoords[idx0], &t1 = texcoords[idx1], &t2 = texcoords[idx2];
                        its.uv = t0 * alpha + t1 * beta + t2 * gamma;
                    } else {
                        its.uv = Point2(0.0f);
                    }

                    if (EXPECT_NOT_TAKEN(colors)) {
                        const Spectrum &c0 = colors[idx0], &c1 = colors[idx1], &c2 = colors[idx2];
                        its.color = c0 * alpha + c1 * beta + c2 * gamma;
                    }

                    if (EXPECT_NOT_TAKEN(tangents)) {
                        const TangentSpace &t0 = tangents[idx0], &t1 = tangents[idx1], &t2 = tangents[idx2];
                        its.dpdu = t0.dpdu * alpha + t1.dpdu * beta + t2.dpdu * gamma;
                        its.dpdv = t0.dpdv * alpha + t1.dpdv * beta + t2.dpdv * gamma;
                    }
                } else {
                    Ray ray(
                        Point(primRay4.o[0].f[idx], primRay4.o[1].f[idx], primRay4.o[2].f[idx]),
                        Vector(primRay4.d[0].f[idx], primRay4.d[1].f[idx], primRay4.d[2].f[idx]),
                        0.0f
                    );
                    its.t = its4.t.f[idx];
                    shape->fillIntersectionRecord(ray,
                        temp + idx * MTS_KD_INTERSECTION_TEMP + 8, its);
                    bsdf = its.shape->getBSDF();
                }

                wo.x = nSecD[0].f[idx];
                wo.y = nSecD[1].f[idx];
                wo.z = nSecD[2].f[idx];

                if (EXPECT_TAKEN(!shape->isLuminaire())) {
                    memset(&emitted[idx], 0, sizeof(Spectrum));
                } else {
                    Vector d(-primRay4.d[0].f[idx], -primRay4.d[1].f[idx], -primRay4.d[2].f[idx]);
                    emitted[idx] = shape->getLuminaire()->Le(
                        ShapeSamplingRecord(its.p, its.shFrame.n), d);
                }

                if (EXPECT_TAKEN(bsdf->getType() == BSDF::EDiffuseReflection && diffuseVPL)) {
                    /* Fast path */
                    direct[idx] = (bsdf->getDiffuseReflectance(its) * vplWeight)
                        * (std::max((Float) 0.0f, dot(wo, its.shFrame.n))
                        * (vplOnSurface ? (std::max(cosThetaLight.f[idx], (Float) 0.0f) * INV_PI) : INV_PI)
                        * invLengthSquared.f[idx]);
                } else {
                    wi.x = -primRay4.d[0].f[idx];
                    wi.y = -primRay4.d[1].f[idx];
                    wi.z = -primRay4.d[2].f[idx];
                    its.p.x = secRay4.o[0].f[idx];
                    its.p.y = secRay4.o[1].f[idx];
                    its.p.z = secRay4.o[2].f[idx];

                    if (EXPECT_NOT_TAKEN(bsdf->getType() & BSDF::EAnisotropic)) {
                        its.shFrame.s = normalize(its.dpdu - its.shFrame.n
                            * dot(its.shFrame.n, its.dpdu));
                        its.shFrame.t = cross(its.shFrame.n, its.shFrame.s);
                    } else {
                        coordinateSystem(its.shFrame.n, its.shFrame.s, its.shFrame.t);
                    }

                    const Float ctLight = cosThetaLight.f[idx];
                    wi = normalize(wi);
                    its.wi = its.toLocal(wi);
                    wo = its.toLocal(wo);

                    if (!diffuseVPL) {
                        if (m_vpl.type == ESurfaceVPL) {
                            BSDFQueryRecord bRec(m_vpl.its, m_vpl.its.toLocal(wi));
                            bRec.quantity = EImportance;
                            vplWeight = m_vpl.its.shape->getBSDF()->eval(bRec) * m_vpl.P;
                        } else {
                            EmissionRecord eRec(m_vpl.luminaire,
                                ShapeSamplingRecord(m_vpl.its.p, m_vpl.its.shFrame.n), wi);
                            eRec.type = EmissionRecord::EPreview;
                            vplWeight = m_vpl.luminaire->evalDirection(eRec) * m_vpl.P;
                        }
                    }

                    if (EXPECT_TAKEN(ctLight >= 0)) {
                        direct[idx] = (bsdf->eval(BSDFQueryRecord(its, wo)) * vplWeight
                            * ((vplOnSurface ? std::max(ctLight, (Float) 0.0f) : 1.0f)
                            * invLengthSquared.f[idx]));
                    } else {
                        memset(&direct[idx], 0, sizeof(Spectrum));
                    }
                }
                ++numRays;
            }

            /* Secondary ray coherence test */
            const int secSignsX = _mm_movemask_ps(secRay4.d[0].ps);
            const int secSignsY = _mm_movemask_ps(secRay4.d[1].ps);
            const int secSignsZ = _mm_movemask_ps(secRay4.d[2].ps);
            const bool secCoherent =
                   (secSignsX == 0 || secSignsX == 0xF)
                && (secSignsY == 0 || secSignsY == 0xF)
                && (secSignsZ == 0 || secSignsZ == 0xF);

            /* Shoot the secondary rays */
            secIts4.t = SSEConstants::p_inf;
            if (EXPECT_TAKEN(secCoherent)) {
                secRay4.signs[0][0] = secSignsX ? 1 : 0;
                secRay4.signs[1][0] = secSignsY ? 1 : 0;
                secRay4.signs[2][0] = secSignsZ ? 1 : 0;
                m_kdtree->rayIntersectPacket(secRay4, secItv4, secIts4, temp);
            } else {
                m_kdtree->rayIntersectPacketIncoherent(secRay4, secItv4, secIts4, temp);
            }

            for (int idx=0; idx<4; ++idx) {
                if (EXPECT_TAKEN(secIts4.t.f[idx] == std::numeric_limits<float>::infinity()))
                    block->setPixel(pos+pixelOffset[idx], direct[idx]+emitted[idx]);
                else
                    block->setPixel(pos+pixelOffset[idx], emitted[idx]);
            }
        }
    }
    block->setExtra(numRays);
#else
    Log(EError, "Coherent raytracing support was not compiled into this binary!");
#endif
}
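/* The preview above uses the raw ~12-bit _mm_rsqrt_ps/_mm_rcp_ps estimates,
   which is generally adequate for an interactive VPL preview. Where full
   float accuracy is needed, one Newton-Raphson step refines the estimate to
   roughly 23 bits; a sketch, not part of the code above: */
#include <xmmintrin.h>
static inline __m128 rsqrt_nr_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    __m128 y = _mm_rsqrt_ps(x); /* initial ~12-bit estimate */
    /* y' = 0.5 * y * (3 - x*y*y) */
    return _mm_mul_ps(_mm_mul_ps(half, y),
                      _mm_sub_ps(three, _mm_mul_ps(x, _mm_mul_ps(y, y))));
}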
void spu_interpreter::FRSQEST(SPUThread& CPU, spu_opcode_t op) { const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); CPU.GPR[op.rt].vf = _mm_rsqrt_ps(_mm_and_ps(CPU.GPR[op.ra].vf, mask)); }
/* One Newton-Raphson refinement of _mm_rsqrt_ps, written so that the
 * arithmetic is hard for the compiler to contract (the additions of zero
 * look deliberate, presumably to block such rewriting). Requires externally
 * defined insn_ABS, v_one (splat of 1.0f) and v_half (splat of 0.5f).
 * Algebraically: y = y + 0.5*y*(1 - x*y*y). */
__m128 voodoo (__m128 a)
{
  __m128 x = insn_ABS (a);
  __m128 y = _mm_rsqrt_ps (x); /* crude ~12-bit estimate */
  y = _mm_add_ps (_mm_mul_ps (_mm_sub_ps (_mm_setzero_ps (),
                                          _mm_sub_ps (_mm_mul_ps (x, _mm_add_ps (_mm_mul_ps (y, y),
                                                                                 _mm_setzero_ps ())),
                                                      v_one)),
                              _mm_add_ps (_mm_mul_ps (y, v_half), _mm_setzero_ps ())),
                  y);
  return y;
}
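/* The same update in plain scalar form, for reference (a sketch): one step
   of Newton's method on f(y) = 1/y^2 - x, starting from a crude estimate y0. */
static inline float rsqrt_refine_scalar(float x, float y0)
{
    return y0 + 0.5f * y0 * (1.0f - x * y0 * y0);
}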
int main()
{
    float *arr = get_arr();     // [4, 3, 2, 1]
    float *uarr = get_uarr();   // [5, 4, 3, 2]
    float *arr2 = get_arr2();   // [4, 3, 2, 1]
    float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
    __m128 a = get_a(); // [8, 6, 4, 2]
    __m128 b = get_b(); // [1, 2, 3, 4]

    // Check that test data is like expected.
    Assert(((uintptr_t)arr & 0xF) == 0);   // arr must be aligned by 16.
    Assert(((uintptr_t)uarr & 0xF) != 0);  // uarr must be unaligned.
    Assert(((uintptr_t)arr2 & 0xF) == 0);  // arr2 must be aligned by 16.
    Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr2 must be unaligned.

    // Test that aeq itself works and does not trivially return true on everything.
    Assert(aeq_("", _mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
    Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif

    // SSE1 Load instructions:
    aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from an aligned address.
    aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
    aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
    aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
    aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
    aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
    aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
    aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

    // SSE1 Set instructions:
    aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
    aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
    aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
    aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
    aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
    aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

    // SSE1 Move instructions:
    aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
    aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
    aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

    // SSE1 Store instructions:
#ifdef TEST_M64
    /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL;
    _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr);
    Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
    /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;
    _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr);
    Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
    _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to an aligned memory address.
    _mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times.
    _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
    _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
    _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
    _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
    _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
    _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to an unaligned memory address.
#ifdef TEST_M64
    /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL));
    Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
    _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

    // SSE1 Arithmetic instructions:
    aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
    aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
    aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
    aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
    aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
    aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
    __m64 m1 = get_m1();
    /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
    /*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
    __m64 m2 = get_m2();
    /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
    /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
    aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
    aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

    // SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
    aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
    aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
    aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
    aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
    aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
    aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

    __m128 i1 = get_i1();
    __m128 i2 = get_i2();

    // SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
    aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
    aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
    aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
    aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

    // SSE1 Compare instructions:
    // a = [8, 6, 4, 2], b = [1, 2, 3, 4]
    aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
    aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
    aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
    aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
    aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
    aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
    aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
    aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
    aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
    aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
    aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
    aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
    aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
    aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
    aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
    aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
    aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
    aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
    aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
    aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

    __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
    __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
    aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
    aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
    // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss.
    // MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
    aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
    aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

    Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
    Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
    Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
    Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
    Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
    Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

    // The ucomi versions are identical to comi, except that the ucomi versions signal a FP exception
    // only if one of the input operands is a SNaN, whereas the comi versions signal a FP exception
    // when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
    Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
    Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
    Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
    Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
    Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
    Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

    // SSE1 Convert instructions:
    __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
    __m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
    __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
    /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
    /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
    aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
    aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
    Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
    Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
    /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f, -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
    /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
    /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
    /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
    /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
    /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
    /*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
    /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f, 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
    /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
    aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channels unchanged.
    Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
    Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
    /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
    Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
    Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
    /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
    Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
    // SSE1 General support:
    unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask);
    unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
    unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode);
    unsigned int csr = _mm_getcsr(); _mm_setcsr(csr);
    unsigned char dummyData[4096];
    _mm_prefetch(dummyData, _MM_HINT_T0);
    _mm_prefetch(dummyData, _MM_HINT_T1);
    _mm_prefetch(dummyData, _MM_HINT_T2);
    _mm_prefetch(dummyData, _MM_HINT_NTA);
    _mm_sfence();
#endif

    // SSE1 Misc instructions:
#ifdef TEST_M64
    /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
    /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
    Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8);
    Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

    // SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
    /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
    /*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
    /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
    /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

    // SSE1 Special Math instructions:
    /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
    /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
    /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
    /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
    /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
    /*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
    /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
    /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
    // a = [8, 6, 4, 2], b = [1, 2, 3, 4]
    aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
    aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
    aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
    aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

    // SSE1 Swizzle instructions:
#ifdef TEST_M64
    /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
    /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
    /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
    /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
    /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
    /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
    aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
    aeq(_mm_unpackhi_ps(a, b), 1.f, 8.f, 2.f, 6.f);
    aeq(_mm_unpacklo_ps(a, b), 3.f, 4.f, 4.f, 2.f);

    // Transposing a matrix via the xmmintrin.h-provided intrinsic.
    __m128 c0 = a;       // [8, 6, 4, 2]
    __m128 c1 = b;       // [1, 2, 3, 4]
    __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
    __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
    _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
    aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
    aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
    aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
    aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

    // All done!
    if (numFailures == 0)
        printf("Success!\n");
    else
        printf("%d tests failed!\n", numFailures);
}
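/* The aeq/aeqi/aeq64 helpers and get_* fixtures used throughout the test
   above are not shown in this snippet. One plausible shape for aeq, assumed
   purely for illustration (the expected values are passed highest lane
   first, matching the calls above): */
#include <xmmintrin.h>
static int numFailures_sketch = 0;
static void aeq_sketch(__m128 v, float e3, float e2, float e1, float e0)
{
    float f[4];
    _mm_storeu_ps(f, v); /* f[0] holds the lowest lane */
    if (f[3] != e3 || f[2] != e2 || f[1] != e1 || f[0] != e0)
        ++numFailures_sketch;
}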
__forceinline __m128 _normalize(__m128 vec) { return _mm_mul_ps(vec, _mm_rsqrt_ps(_mm_dp_ps(vec, vec, 0x7F))); } // 0x7F: dot product over x, y, z only, result broadcast to all four lanes (requires SSE4.1)
__m128 test (__m128 s1) { return _mm_rsqrt_ps (s1); }
static inline float rsqrt_fast(const float x) { const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rsqrt_ps(a); return _mm_cvtss_f32(r); }
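/* rsqrt_fast above returns the raw ~12-bit hardware estimate. Where a call
   site needs more accuracy, one Newton-Raphson step can be appended; a
   sketch under that assumption, not part of the original: */
#include <xmmintrin.h>
static inline float rsqrt_refined(const float x)
{
    const __m128 a = _mm_set_ss(x);
    __m128 r = _mm_rsqrt_ss(a); /* estimate in the lowest lane */
    /* r = 0.5 * r * (3 - x*r*r) */
    r = _mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), r),
                   _mm_sub_ss(_mm_set_ss(3.0f), _mm_mul_ss(a, _mm_mul_ss(r, r))));
    return _mm_cvtss_f32(r);
}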
static inline __m128d my_invrsq_pd(__m128d x)
{
    const __m128d three = (const __m128d) {3.0, 3.0};
    const __m128d half  = (const __m128d) {0.5, 0.5};

    __m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
    __m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

    /* First Newton-Raphson step, accuracy is now 24 bits */
    __m128d t2 = _mm_mul_pd(half, _mm_mul_pd(t1, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t1, t1)))));

    /* Return second Newton-Raphson step, accuracy 48 bits */
    return (__m128d) _mm_mul_pd(half, _mm_mul_pd(t2, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t2, t2)))));
}

/* Extract single 32-bit table indices from a __m128i datatype
 * (despite the epi64 name, this reads 32 bits at 32-bit offsets) */
#define _mm_extract_epi64(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

void nb_kernel430_ia32_sse2(int *p_nri, int *iinr, int *jindex, int *jjnr, int *shift,
                            double *shiftvec, double *fshift, int *gid, double *pos,
                            double *faction, double *charge, double *p_facel,
                            double *p_krf, double *p_crf, double *Vc, int *type,
                            int *p_ntype, double *vdwparam, double *Vvdw,
                            double *p_tabscale, double *VFtab, double *invsqrta,
                            double *dvda, double *p_gbtabscale, double *GBtab,
                            int *p_nthreads, int *count, void *mtx,
                            int *outeriter, int *inneriter, double *work)
{
    int nri,ntype,nthreads,offset,tj,tj2,nti;
    int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
    double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
    double shX,shY,shZ,isai_d,dva;
    gmx_gbdata_t *gbdata;
    float *gpol;

    __m128d ix,iy,iz,jx,jy,jz;
    __m128d dx,dy,dz,t1,t2,t3;
    __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
    __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
    __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
    __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
    __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
    __m128d fac,tabscale,gbtabscale;
    __m128i n0,nnn;

    const __m128d neg    = {-1.0, -1.0};
    const __m128d zero   = { 0.0,  0.0};
    const __m128d half   = { 0.5,  0.5};
    const __m128d two    = { 2.0,  2.0};
    const __m128d three  = { 3.0,  3.0};
    const __m128d six    = { 6.0,  6.0};
    const __m128d twelve = {12.0, 12.0};
    const __m128i four   = _mm_set_epi32(4,4,4,4);

    gbdata = (gmx_gbdata_t *)work;
    gpol   = gbdata->gpol;

    nri      = *p_nri;
    ntype    = *p_ntype;
    nthreads = *p_nthreads;
    facel    = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));
    krf      = *p_krf;
    crf      = *p_crf;
    tabscl   = *p_tabscale;
    gbtabscl = *p_gbtabscale;
    nj1      = 0;

    /* Splat variables */
    fac        = _mm_load1_pd(&facel);
    tabscale   = _mm_load1_pd(&tabscl);
    gbtabscale = _mm_load1_pd(&gbtabscl);

    /* Keep compiler happy */
    Vvdwtmp = _mm_setzero_pd();
    Vvdwtot = _mm_setzero_pd();
    dvdatmp = _mm_setzero_pd();
    dvdaj   = _mm_setzero_pd();
    isaj    = _mm_setzero_pd();
    vcoul   = _mm_setzero_pd();
    vgb     = _mm_setzero_pd();
    t1 = _mm_setzero_pd();
    t2 = _mm_setzero_pd();
    t3 = _mm_setzero_pd();
    xmm1 = _mm_setzero_pd();
    xmm2 = _mm_setzero_pd();
    xmm3 = _mm_setzero_pd();
    xmm4 = _mm_setzero_pd();
    jnr1 = jnr2 = 0;
    j13  = j23  = 0;

    for(n=0;n<nri;n++)
    {
        is3    = 3*shift[n];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = jindex[n];
        nj1    = jindex[n+1];
        offset = (nj1-nj0)%2;
        ii     = iinr[n];
        ii3    = ii*3;

        ix = _mm_set1_pd(shX+pos[ii3+0]);
        iy = _mm_set1_pd(shY+pos[ii3+1]);
        iz = _mm_set1_pd(shZ+pos[ii3+2]);

        q      = _mm_set1_pd(charge[ii]);
        iq     = _mm_mul_pd(fac,q);
        isai_d = invsqrta[ii];
        isai   = _mm_load1_pd(&isai_d);
        nti    = 2*ntype*type[ii];

        fix     = _mm_setzero_pd();
        fiy     = _mm_setzero_pd();
        fiz     = _mm_setzero_pd();
        dvdasum = _mm_setzero_pd();
        vctot   = _mm_setzero_pd();
        vgbtot  = _mm_setzero_pd();
        Vvdwtot = _mm_setzero_pd();

        for(k=nj0;k<nj1-offset; k+=2)
        {
            jnr1 = jjnr[k];
            jnr2 = jjnr[k+1];
            j13  = jnr1 * 3;
            j23  = jnr2 * 3;

            /* Load coordinates */
            xmm1 = _mm_loadu_pd(pos+j13);  /* x1 y1 */
            xmm2 = _mm_loadu_pd(pos+j23);  /* x2 y2 */
            xmm5 = _mm_load_sd(pos+j13+2); /* z1 -  */
            xmm6 = _mm_load_sd(pos+j23+2); /* z2 -  */

            /* transpose */
            jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
            jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));

            /* distances */
            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);

            /* Load invsqrta */
            isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
            isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
            isaprod = _mm_mul_pd(isai,isaj);

            /* Load charges */
            q  = _mm_loadl_pd(q,charge+jnr1);
            q  = _mm_loadh_pd(q,charge+jnr2);
            qq = _mm_mul_pd(iq,q);

            vcoul = _mm_mul_pd(qq,rinv);
            fscal = _mm_mul_pd(vcoul,rinv);
            qq = _mm_mul_pd(isaprod,qq);
            qq = _mm_mul_pd(qq,neg);
            gbscale = _mm_mul_pd(isaprod,gbtabscale);

            /* Load VdW parameters */
            tj  = nti+2*type[jnr1];
            tj2 = nti+2*type[jnr2];
            xmm1 = _mm_loadu_pd(vdwparam+tj);
            xmm2 = _mm_loadu_pd(vdwparam+tj2);
            c6  = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));

            /* Load dvdaj */
            dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
            dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);

            /* Calculate GB table index */
            r    = _mm_mul_pd(rsq11,rinv);
            rt   = _mm_mul_pd(r,gbscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_pd(rt,n0d);
            eps2 = _mm_mul_pd(eps,eps);
            nnn  = _mm_slli_epi64(n0,2);

            xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
            Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

            G  = _mm_mul_pd(G,eps);
            H  = _mm_mul_pd(H,eps2);
            Fp = _mm_add_pd(F,G);
            Fp = _mm_add_pd(Fp,H);
            VV = _mm_mul_pd(Fp,eps);
            VV = _mm_add_pd(Y,VV);
            H  = _mm_mul_pd(two,H);
            FF = _mm_add_pd(Fp,G);
            FF = _mm_add_pd(FF,H);
            vgb  = _mm_mul_pd(qq,VV);
            fijC = _mm_mul_pd(qq,FF);
            fijC = _mm_mul_pd(fijC,gbscale);

            dvdatmp = _mm_mul_pd(fijC,r);
            dvdatmp = _mm_add_pd(vgb,dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp,neg);
            dvdatmp = _mm_mul_pd(dvdatmp,half);
            dvdasum = _mm_add_pd(dvdasum,dvdatmp);

            xmm1  = _mm_mul_pd(dvdatmp,isaj);
            xmm1  = _mm_mul_pd(xmm1,isaj);
            dvdaj = _mm_add_pd(dvdaj,xmm1);

            /* store dvda */
            _mm_storel_pd(dvda+jnr1,dvdaj);
            _mm_storeh_pd(dvda+jnr2,dvdaj);

            vctot  = _mm_add_pd(vctot,vcoul);
            vgbtot = _mm_add_pd(vgbtot,vgb);

            /* Calculate VDW table index */
            rt   = _mm_mul_pd(r,tabscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_pd(rt,n0d);
            eps2 = _mm_mul_pd(eps,eps);
            nnn  = _mm_slli_epi32(n0,3);

            /* Tabulated VdW interaction - dispersion */
            xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
            Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

            G  = _mm_mul_pd(G,eps);
            H  = _mm_mul_pd(H,eps2);
            Fp = _mm_add_pd(F,G);
            Fp = _mm_add_pd(Fp,H);
            VV = _mm_mul_pd(Fp,eps);
            VV = _mm_add_pd(Y,VV);
            xmm1 = _mm_mul_pd(two,H);
            FF = _mm_add_pd(Fp,G);
            FF = _mm_add_pd(FF,xmm1);
            Vvdw6 = _mm_mul_pd(c6,VV);
            fijD  = _mm_mul_pd(c6,FF);

            /* Tabulated VdW interaction - repulsion */
            nnn  = _mm_add_epi32(nnn,four);
            xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
            Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

            G  = _mm_mul_pd(G,eps);
            H  = _mm_mul_pd(H,eps2);
            Fp = _mm_add_pd(F,G);
            Fp = _mm_add_pd(Fp,H);
            VV = _mm_mul_pd(Fp,eps);
            VV = _mm_add_pd(Y,VV);
            xmm1 = _mm_mul_pd(two,H);
            FF = _mm_add_pd(Fp,G);
            FF = _mm_add_pd(FF,xmm1);
            Vvdw12 = _mm_mul_pd(c12,VV);
            fijR   = _mm_mul_pd(c12,FF);

            Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
            Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);

            xmm1 = _mm_add_pd(fijD,fijR);
            xmm1 = _mm_mul_pd(xmm1,tabscale);
            xmm1 = _mm_add_pd(xmm1,fijC);
            xmm1 = _mm_sub_pd(xmm1,fscal);
            fscal = _mm_mul_pd(xmm1,neg);
            fscal = _mm_mul_pd(fscal,rinv);

            /* calculate partial force terms */
            t1 = _mm_mul_pd(fscal,dx);
            t2 = _mm_mul_pd(fscal,dy);
            t3 = _mm_mul_pd(fscal,dz);

            /* update the i force */
            fix = _mm_add_pd(fix,t1);
            fiy = _mm_add_pd(fiy,t2);
            fiz = _mm_add_pd(fiz,t3);

            /* accumulate forces from memory */
            xmm1 = _mm_loadu_pd(faction+j13);   /* fx1 fy1 */
            xmm2 = _mm_loadu_pd(faction+j23);   /* fx2 fy2 */
            xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
            xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */

            /* transpose */
            xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
            xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
            xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

            /* subtract partial forces */
            xmm5 = _mm_sub_pd(xmm5,t1);
            xmm6 = _mm_sub_pd(xmm6,t2);
            xmm7 = _mm_sub_pd(xmm7,t3);

            xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
            xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */

            /* store fx and fy */
            _mm_storeu_pd(faction+j13,xmm1);
            _mm_storeu_pd(faction+j23,xmm2);

            /* .. then fz */
            _mm_storel_pd(faction+j13+2,xmm7);
            _mm_storeh_pd(faction+j23+2,xmm7);
        }

        /* In double precision, offset can only be either 0 or 1 */
        if(offset!=0)
        {
            jnr1 = jjnr[k];
            j13  = jnr1*3;

            jx = _mm_load_sd(pos+j13);
            jy = _mm_load_sd(pos+j13+1);
            jz = _mm_load_sd(pos+j13+2);

            isaj    = _mm_load_sd(invsqrta+jnr1);
            isaprod = _mm_mul_sd(isai,isaj);
            dvdaj   = _mm_load_sd(dvda+jnr1);
            q       = _mm_load_sd(charge+jnr1);
            qq      = _mm_mul_sd(iq,q);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);

            vcoul = _mm_mul_sd(qq,rinv);
            fscal = _mm_mul_sd(vcoul,rinv);
            qq = _mm_mul_sd(isaprod,qq);
            qq = _mm_mul_sd(qq,neg);
            gbscale = _mm_mul_sd(isaprod,gbtabscale);

            /* Load VdW parameters */
            tj  = nti+2*type[jnr1];
            c6  = _mm_load_sd(vdwparam+tj);
            c12 = _mm_load_sd(vdwparam+tj+1);

            /* Calculate GB table index */
            r    = _mm_mul_sd(rsq11,rinv);
            rt   = _mm_mul_sd(r,gbscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_sd(rt,n0d);
            eps2 = _mm_mul_sd(eps,eps);
            nnn  = _mm_slli_epi64(n0,2);

            xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));
            xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));
            xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2);
            xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2);
            Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
            G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
            H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));

            G  = _mm_mul_sd(G,eps);
            H  = _mm_mul_sd(H,eps2);
            Fp = _mm_add_sd(F,G);
            Fp = _mm_add_sd(Fp,H);
            VV = _mm_mul_sd(Fp,eps);
            VV = _mm_add_sd(Y,VV);
            H  = _mm_mul_sd(two,H);
            FF = _mm_add_sd(Fp,G);
            FF = _mm_add_sd(FF,H);
            vgb  = _mm_mul_sd(qq,VV);
            fijC = _mm_mul_sd(qq,FF);
            fijC = _mm_mul_sd(fijC,gbscale);

            dvdatmp = _mm_mul_sd(fijC,r);
            dvdatmp = _mm_add_sd(vgb,dvdatmp);
            dvdatmp = _mm_mul_sd(dvdatmp,neg);
            dvdatmp = _mm_mul_sd(dvdatmp,half);
            dvdasum = _mm_add_sd(dvdasum,dvdatmp);

            xmm1  = _mm_mul_sd(dvdatmp,isaj);
            xmm1  = _mm_mul_sd(xmm1,isaj);
            dvdaj = _mm_add_sd(dvdaj,xmm1);

            /* store dvda */
            _mm_storel_pd(dvda+jnr1,dvdaj);

            vctot  = _mm_add_sd(vctot,vcoul);
            vgbtot = _mm_add_sd(vgbtot,vgb);

            /* Calculate VDW table index */
            rt   = _mm_mul_sd(r,tabscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_sd(rt,n0d);
            eps2 = _mm_mul_sd(eps,eps);
            nnn  = _mm_slli_epi32(n0,3);

            /* Tabulated VdW interaction - dispersion */
            xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
            Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

            G  = _mm_mul_sd(G,eps);
            H  = _mm_mul_sd(H,eps2);
            Fp = _mm_add_sd(F,G);
            Fp = _mm_add_sd(Fp,H);
            VV = _mm_mul_sd(Fp,eps);
            VV = _mm_add_sd(Y,VV);
            xmm1 = _mm_mul_sd(two,H);
            FF = _mm_add_sd(Fp,G);
            FF = _mm_add_sd(FF,xmm1);
            Vvdw6 = _mm_mul_sd(c6,VV);
            fijD  = _mm_mul_sd(c6,FF);

            /* Tabulated VdW interaction - repulsion */
            nnn  = _mm_add_epi32(nnn,four);
            xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
            Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

            G  = _mm_mul_sd(G,eps);
            H  = _mm_mul_sd(H,eps2);
            Fp = _mm_add_sd(F,G);
            Fp = _mm_add_sd(Fp,H);
            VV = _mm_mul_sd(Fp,eps);
            VV = _mm_add_sd(Y,VV);
            xmm1 = _mm_mul_sd(two,H);
            FF = _mm_add_sd(Fp,G);
            FF = _mm_add_sd(FF,xmm1);
            Vvdw12 = _mm_mul_sd(c12,VV);
            fijR   = _mm_mul_sd(c12,FF);

            Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
            Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);

            xmm1 = _mm_add_sd(fijD,fijR);
            xmm1 = _mm_mul_sd(xmm1,tabscale);
            xmm1 = _mm_add_sd(xmm1,fijC);
            xmm1 = _mm_sub_sd(xmm1,fscal);
            fscal = _mm_mul_sd(xmm1,neg);
            fscal = _mm_mul_sd(fscal,rinv);

            /* calculate partial force terms */
            t1 = _mm_mul_sd(fscal,dx);
            t2 = _mm_mul_sd(fscal,dy);
            t3 = _mm_mul_sd(fscal,dz);

            /* update the i force */
            fix = _mm_add_sd(fix,t1);
            fiy = _mm_add_sd(fiy,t2);
            fiz = _mm_add_sd(fiz,t3);

            /* accumulate forces from memory */
            xmm5 = _mm_load_sd(faction+j13);   /* fx */
            xmm6 = _mm_load_sd(faction+j13+1); /* fy */
            xmm7 = _mm_load_sd(faction+j13+2); /* fz */

            /* subtract partial forces */
            xmm5 = _mm_sub_sd(xmm5,t1);
            xmm6 = _mm_sub_sd(xmm6,t2);
            xmm7 = _mm_sub_sd(xmm7,t3);

            /* store forces */
            _mm_store_sd(faction+j13,xmm5);
            _mm_store_sd(faction+j13+1,xmm6);
            _mm_store_sd(faction+j13+2,xmm7);
        }

        /* fix/fiy/fiz now contain two partial sums each, which all need to
         * be added to the i particle forces */
        t1 = _mm_unpacklo_pd(t1,fix);
        t2 = _mm_unpacklo_pd(t2,fiy);
        t3 = _mm_unpacklo_pd(t3,fiz);
        fix = _mm_add_pd(fix,t1);
        fiy = _mm_add_pd(fiy,t2);
        fiz = _mm_add_pd(fiz,t3);
        fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
        fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
        fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

        /* Load i forces from memory */
        xmm1 = _mm_load_sd(faction+ii3);
        xmm2 = _mm_load_sd(faction+ii3+1);
        xmm3 = _mm_load_sd(faction+ii3+2);

        /* Add to i force */
        fix = _mm_add_sd(fix,xmm1);
        fiy = _mm_add_sd(fiy,xmm2);
        fiz = _mm_add_sd(fiz,xmm3);

        /* store i forces to memory */
        _mm_store_sd(faction+ii3,fix);
        _mm_store_sd(faction+ii3+1,fiy);
        _mm_store_sd(faction+ii3+2,fiz);

        /* now do dvda */
        dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
        dvdasum = _mm_add_pd(dvdasum,dvdatmp);
        _mm_storeh_pd(&dva,dvdasum);
        dvda[ii] = dvda[ii] + dva*isai_d*isai_d;

        ggid = gid[n];

        /* Coulomb potential */
        vcoul = _mm_unpacklo_pd(vcoul,vctot);
        vctot = _mm_add_pd(vctot,vcoul);
        _mm_storeh_pd(&vct,vctot);
        Vc[ggid] = Vc[ggid] + vct;

        /* VdW potential */
        Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
        Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
        _mm_storeh_pd(&vdwt,Vvdwtot);
        Vvdw[ggid] = Vvdw[ggid] + vdwt;

        /* GB potential */
        vgb = _mm_unpacklo_pd(vgb,vgbtot);
        vgbtot = _mm_add_pd(vgbtot,vgb);
        _mm_storeh_pd(&vgbt,vgbtot);
        gpol[ggid] = gpol[ggid] + vgbt;
    }

    *outeriter = nri;
    *inneriter = nj1;
}
void lfModifier::ModifyCoord_UnDist_PTLens_SSE (void *data, float *iocoord, int count)
{
    // See "Note about PT-based distortion models" at the top of mod-coord.cpp.

    /*
     * If the buffer is not aligned, fall back to the plain code
     */
    if ((uintptr_t)(iocoord) & 0xf)
    {
        return ModifyCoord_UnDist_PTLens (data, iocoord, count);
    }

    lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data;

    __m128 a_ = _mm_set_ps1 (cddata->Terms [0]);
    __m128 b_ = _mm_set_ps1 (cddata->Terms [1]);
    __m128 c_ = _mm_set_ps1 (cddata->Terms [2]);
    __m128 cx = _mm_set_ps1 (cddata->centerX);
    __m128 cy = _mm_set_ps1 (cddata->centerY);
    __m128 cc = _mm_set_ps1 (cddata->coordinate_correction);
    __m128 very_small = _mm_set_ps1 (1e-15f);
    __m128 one = _mm_set_ps1 (1.0f);

    // The SSE loop processes 4 pixels per iteration
    int loop_count = count / 4;
    for (int i = 0; i < loop_count; i++)
    {
        // Load 4 sets of coordinates
        __m128 c0 = _mm_load_ps (&iocoord[8*i]);
        __m128 c1 = _mm_load_ps (&iocoord[8*i+4]);
        __m128 x = _mm_shuffle_ps (c1, c0, _MM_SHUFFLE (2, 0, 2, 0));
        __m128 y = _mm_shuffle_ps (c1, c0, _MM_SHUFFLE (3, 1, 3, 1));
        x = _mm_sub_ps (_mm_mul_ps (x, cc), cx);
        y = _mm_sub_ps (_mm_mul_ps (y, cc), cy);

        __m128 rd = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));
        // We don't check for zero, but set it to a very small value instead
        rd = _mm_max_ps (rd, very_small);
        rd = _mm_rcp_ps (_mm_rsqrt_ps (rd));

        __m128 ru = rd;
        for (int step = 0; step < 4; step++)
        {
            // fru = ru * (a_ * ru^3 + b_ * ru^2 + c_ * ru + 1) - rd
            __m128 ru_sq = _mm_mul_ps (ru, ru);
            __m128 fru = _mm_mul_ps (_mm_mul_ps (a_, ru), ru_sq);
            __m128 t = _mm_add_ps (_mm_mul_ps (b_, ru_sq), _mm_add_ps (one, _mm_mul_ps (c_, ru)));
            fru = _mm_sub_ps (_mm_mul_ps (_mm_add_ps (t, fru), ru), rd);

            // This is most likely faster than loading from L1 cache
            __m128 two = _mm_add_ps (one, one);
            __m128 three = _mm_add_ps (one, two);
            __m128 four = _mm_add_ps (two, two);

            // corr = 1 / (4 * a * ru^3 + 3 * b * ru^2 + 2 * c * ru + 1),
            // the reciprocal of the derivative of the PTLens polynomial
            __m128 corr = _mm_mul_ps (c_, ru);
            corr = _mm_add_ps (one, _mm_add_ps (corr, corr));
            t = _mm_mul_ps (ru_sq, _mm_mul_ps (three, b_));
            corr = _mm_add_ps (corr, _mm_mul_ps (_mm_mul_ps (ru, ru_sq), _mm_mul_ps (four, a_)));
            corr = _mm_rcp_ps (_mm_add_ps (corr, t));

            // ru -= fru * corr
            ru = _mm_sub_ps (ru, _mm_mul_ps (fru, corr));
        }
        // We don't check for zero, but set it to a very small value instead
        ru = _mm_max_ps (ru, very_small);
        // ru /= rd
        ru = _mm_mul_ps (ru, _mm_rcp_ps (rd));
        x = _mm_add_ps (_mm_mul_ps (x, ru), cx);
        y = _mm_add_ps (_mm_mul_ps (y, ru), cy);
        x = _mm_div_ps (x, cc);
        y = _mm_div_ps (y, cc);
        c0 = _mm_unpacklo_ps (x, y);
        c1 = _mm_unpackhi_ps (x, y);
        _mm_store_ps (&iocoord [8 * i], c0);
        _mm_store_ps (&iocoord [8 * i + 4], c1);
    }

    loop_count *= 4;
    int remain = count - loop_count;
    if (remain)
        ModifyCoord_UnDist_PTLens (data, &iocoord [loop_count * 2], remain);
}
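/* The vector loop above runs four Newton steps on
   f(ru) = ru * (a*ru^3 + b*ru^2 + c*ru + 1) - rd. The same iteration in
   scalar form, as a readable reference (a sketch, not lensfun API): */
static float undist_ptlens_radius_sketch(float rd, float a, float b, float c)
{
    float ru = rd; /* initial guess: undistorted radius == distorted radius */
    for (int step = 0; step < 4; step++)
    {
        float ru_sq = ru * ru;
        float fru   = ru * (a * ru * ru_sq + b * ru_sq + c * ru + 1.0f) - rd;
        float dfru  = 4.0f * a * ru * ru_sq + 3.0f * b * ru_sq + 2.0f * c * ru + 1.0f;
        ru -= fru / dfru; /* Newton step: ru' = ru - f(ru)/f'(ru) */
    }
    return ru;
}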
/* Note: this returns x0 * (rhs*x0^2 - 3), i.e. -2x the refined estimate
   0.5*x0*(3 - rhs*x0^2); the missing -0.5 factor is presumably folded into
   the caller's coefficients. */
static inline v4sf rsqrt_1st_phantom(const v4sf rhs)
{
    v4sf x0 = v4sf(_mm_rsqrt_ps(rhs.val));
    return v4sf(x0 * (rhs * x0 * x0 - v4sf(3.)));
}
/* One Newton-Raphson step on the hardware estimate: x1 = x0 + 0.5*x0*(1 - rhs*x0^2) */
static inline v4sf rsqrt_1st(const v4sf rhs)
{
    v4sf x0 = v4sf(_mm_rsqrt_ps(rhs.val));
    v4sf h = v4sf(1.) - rhs * x0 * x0;
    return x0 + v4sf(0.5) * h * x0;
}
/* Raw hardware estimate, relative error at most about 1.5 * 2^-12 */
static inline v4sf rsqrt_0th(const v4sf rhs) { return v4sf(_mm_rsqrt_ps(rhs.val)); }
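/* Rough accuracy of the variants above: the raw estimate (rsqrt_0th) is good
   to about 1.5*2^-12 relative error, and one Newton step (rsqrt_1st) roughly
   squares that. A scalar spot check using plain intrinsics (the v4sf wrapper
   type is external to these snippets): */
#include <xmmintrin.h>
#include <math.h>
#include <stdio.h>
void rsqrt_error_demo(void)
{
    const float x = 2.0f;
    float y0 = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x))); /* 0th order estimate */
    float y1 = y0 + 0.5f * y0 * (1.0f - x * y0 * y0);      /* after one Newton step */
    float ref = 1.0f / sqrtf(x);
    printf("rel err: 0th=%g 1st=%g\n", fabsf(y0 - ref) / ref, fabsf(y1 - ref) / ref);
}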