inline __m128d& operator+=(__m128d& v1, const __m128d& v2) {
	return (v1 = _mm_add_pd(v1, v2));
}
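/* This overload lets the generated code below accumulate __m128d values with
   infix syntax. A minimal usage sketch (values are illustrative):
       __m128d acc = _mm_set1_pd(1.0);
       acc += _mm_set1_pd(2.0);   // acc now holds {3.0, 3.0}
*/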
void exchrhs_gmrfData_8(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((!neighbor_isValid[0][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S589, S588, S590 */
{
{
{
int i1 = 0;
for (; (i1<(1&(~1))); i1 += 1) {
yPos = (((i1/2.560000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(2.560000e+02);
__m128d vec4 = _mm_set1_pd(yPos);
for (; (i1<254); i1 += 4) {
/* yPos = (((i1/2.560000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+3,i1+2);
__m128d vec2 = _mm_load1_pd((&posEnd[1]));
__m128d vec2_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec3 = _mm_load1_pd((&posBegin[1]));
__m128d vec3_2 = _mm_load1_pd((&posBegin[1]));
vec4 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(vec0, vec1), _mm_sub_pd(vec2, vec3)), vec3);
vec4 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(vec0_2, vec1), _mm_sub_pd(vec2_2, vec3_2)), vec3_2);
}
for (; (i1<257); i1 += 1) {
yPos = (((i1/2.560000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
{
int i1 = 0;
for (; (i1<(1&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<254); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<257); i1 += 1) {
xPos = posBegin[0];
}
}
}
{
double* fieldData_RHS_GMRF_8_p1 = (&fieldData_RHS_GMRF[8][0]);
int i1 = 0;
for (; (i1<=255); i1 += 2) {
fieldData_RHS_GMRF_8_p1[(i1*258)] = 0.000000e+00;
fieldData_RHS_GMRF_8_p1[((i1*258)+258)] = 0.000000e+00;
}
for (; (i1<=256); i1 += 1) {
fieldData_RHS_GMRF_8_p1[(i1*258)] = 0.000000e+00;
}
}
}
}
}
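/* The block above handles a missing neighbor in direction 0: the xPos/yPos
   loops compute positions into locals that are never read afterwards
   (apparently dead code from the generator), while the final loop zeroes
   column 0 of all 257 rows of the RHS field (row stride 258), i.e. the
   boundary layer on that side. */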
if ((!neighbor_isValid[0][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S592, S591, S593 */
{
{
{
int i1 = 0;
for (; (i1<(1&(~1))); i1 += 1) {
yPos = (((i1/2.560000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(2.560000e+02);
__m128d vec4 = _mm_set1_pd(yPos);
for (; (i1<254); i1 += 4) {
/* yPos = (((i1/2.560000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+3,i1+2);
__m128d vec2 = _mm_load1_pd((&posEnd[1]));
__m128d vec2_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec3 = _mm_load1_pd((&posBegin[1]));
__m128d vec3_2 = _mm_load1_pd((&posBegin[1]));
vec4 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(vec0, vec1), _mm_sub_pd(vec2, vec3)), vec3);
vec4 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(vec0_2, vec1), _mm_sub_pd(vec2_2, vec3_2)), vec3_2);
}
for (; (i1<257); i1 += 1) {
yPos = (((i1/2.560000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
{
double* fieldData_RHS_GMRF_8_p1 = (&fieldData_RHS_GMRF[8][0]);
int i1 = 0;
for (; (i1<=255); i1 += 2) {
fieldData_RHS_GMRF_8_p1[((i1*258)+256)] = 0.000000e+00;
fieldData_RHS_GMRF_8_p1[((i1*258)+514)] = 0.000000e+00;
}
for (; (i1<=256); i1 += 1) {
fieldData_RHS_GMRF_8_p1[((i1*258)+256)] = 0.000000e+00;
}
}
}
{
int i1 = 0;
for (; (i1<(1&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<254); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<257); i1 += 1) {
xPos = posEnd[0];
}
}
}
}
}
if ((!neighbor_isValid[0][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S595, S594, S596 */
{
{
{
double* fieldData_RHS_GMRF_8_p1 = (&fieldData_RHS_GMRF[8][0]);
int i2 = 0;
for (; (i2<=255); i2 += 2) {
fieldData_RHS_GMRF_8_p1[i2] = 0.000000e+00;
fieldData_RHS_GMRF_8_p1[(i2+1)] = 0.000000e+00;
}
for (; (i2<=256); i2 += 1) {
fieldData_RHS_GMRF_8_p1[i2] = 0.000000e+00;
}
}
{
int i2 = 0;
for (; (i2<=255); i2 += 2) {
xPos = (((i2/2.560000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2+1)/2.560000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=256); i2 += 1) {
xPos = (((i2/2.560000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
int i2 = 0;
for (; (i2<=255); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=256); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[0][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S598, S597, S599 */
{
{
{
int i2 = 0;
for (; (i2<=255); i2 += 2) {
xPos = (((i2/2.560000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2+1)/2.560000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=256); i2 += 1) {
xPos = (((i2/2.560000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
{
int i2 = 0;
for (; (i2<=255); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=256); i2 += 1) {
yPos = posEnd[1];
}
}
}
{
double* fieldData_RHS_GMRF_8_p1 = (&fieldData_RHS_GMRF[8][0]);
int i2 = 0;
for (; (i2<=255); i2 += 2) {
fieldData_RHS_GMRF_8_p1[(i2+66048)] = 0.000000e+00;
fieldData_RHS_GMRF_8_p1[(i2+66049)] = 0.000000e+00;
}
for (; (i2<=256); i2 += 1) {
fieldData_RHS_GMRF_8_p1[(i2+66048)] = 0.000000e+00;
}
}
}
}
}
}
}
}
inline __m128d operator+(const __m128d& v1, const __m128d& v2) {
	return _mm_add_pd(v1, v2);
}
Example #4
BOOST_FORCEINLINE __m128d __vectorcall operator + ( __m128d const left, __m128d const right ) {
    return _mm_add_pd( left, right );
}
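/* Functionally the same addition as the overloads above; this variant takes
   its operands by value with __vectorcall (an MSVC calling convention) so
   they can be passed in XMM registers instead of through references. */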
void exchlaplacecoeffData_7(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((!neighbor_isValid[1][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S956, S958, S952, S955, S960, S954, S957, S951, S959, S950, S953 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+138338)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+138470)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+138338)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+34586)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+34718)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+34586)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+103754)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+103886)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+103754)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+2)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+134)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+2)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+69170)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+69302)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+69170)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+121046)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+121178)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+121046)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+51878)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+52010)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+51878)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<127); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<130); i1 += 1) {
xPos = posBegin[0];
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+86462)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+86594)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+86462)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+17294)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+17426)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+17294)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/1.280000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(1.280000e+02);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<127); i1 += 4) {
/* yPos = ((((i1-1)/1.280000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+3,i1+2);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<130); i1 += 1) {
yPos = ((((i1-1)/1.280000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
}
}
if ((!neighbor_isValid[1][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S962, S961, S970, S964, S967, S966, S969, S963, S971, S965, S968 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+121174)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+121306)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+121174)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+138466)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+138598)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+138466)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/1.280000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(1.280000e+02);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<127); i1 += 4) {
/* yPos = ((((i1-1)/1.280000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+3,i1+2);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<130); i1 += 1) {
yPos = ((((i1-1)/1.280000e+02)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+103882)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+104014)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+103882)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<127); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<130); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+130)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+262)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+130)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+69298)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+69430)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+69298)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+52006)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+52138)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+52006)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+86590)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+86722)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+86590)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+34714)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+34846)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+34714)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i1 = 1;
for (; (i1<=128); i1 += 2) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+17422)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[((i1*132)+17554)] = 0.000000e+00;
}
for (; (i1<=129); i1 += 1) {
fieldData_LaplaceCoeff_7_p1[((i1*132)+17422)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[1][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S982, S976, S979, S973, S972, S981, S975, S978, S977, S980, S974 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+138468)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+138469)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+138468)] = 0.000000e+00;
}
}
{
int i2 = 2;
for (; (i2<=129); i2 += 2) {
xPos = ((((i2-2)/1.280000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/1.280000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=130); i2 += 1) {
xPos = ((((i2-2)/1.280000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+34716)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+34717)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+34716)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+17424)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+17425)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+17424)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+132)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+133)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+132)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+86592)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+86593)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+86592)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+52008)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+52009)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+52008)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+103884)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+103885)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+103884)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+121176)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+121177)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+121176)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=129); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=130); i2 += 1) {
yPos = posBegin[1];
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+69300)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+69301)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+69300)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[1][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S988, S991, S985, S990, S984, S993, S987, S983, S992, S986, S989 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+138072)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+138073)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+138072)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+34320)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+34321)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+34320)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+155364)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+155365)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+155364)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+51612)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+51613)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+51612)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=129); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=130); i2 += 1) {
yPos = posEnd[1];
}
}
}
{
int i2 = 2;
for (; (i2<=129); i2 += 2) {
xPos = ((((i2-2)/1.280000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/1.280000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=130); i2 += 1) {
xPos = ((((i2-2)/1.280000e+02)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+103488)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+103489)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+103488)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+68904)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+68905)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+68904)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+17028)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+17029)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+17028)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+120780)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+120781)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+120780)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][0]);
int i2 = 2;
for (; (i2<=129); i2 += 2) {
fieldData_LaplaceCoeff_7_p1[(i2+86196)] = 0.000000e+00;
fieldData_LaplaceCoeff_7_p1[(i2+86197)] = 0.000000e+00;
}
for (; (i2<=130); i2 += 1) {
fieldData_LaplaceCoeff_7_p1[(i2+86196)] = 0.000000e+00;
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S994 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*129)]);
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][(i3*17292)]);
int i4 = 1;
for (; (i4<=128); i4 += 2) {
buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_7_p1[((i4*132)+130)];
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_7_p1[((i4*132)+262)];
}
for (; (i4<=129); i4 += 1) {
buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_7_p1[((i4*132)+130)];
}
}
}
}
}
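/* The loop above packs the ghost layer for neighbor 1 (9 components x 129
   values = 1161 doubles) into buffer_Send[1]; the MPI_Isend below ships it
   with a tag combining the sender commId (high 16 bits) and the receiver
   fragment id (low 16 bits). */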
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(buffer_Send[1], 1161, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(buffer_Recv[0], 1161, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S995 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*129)]);
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][(i3*17292)]);
int i4 = 3;
for (; (i4<=130); i4 += 2) {
fieldData_LaplaceCoeff_7_p1[((i4*132)-262)] = buffer_Recv_0_p1[(i4-3)];
fieldData_LaplaceCoeff_7_p1[((i4*132)-130)] = buffer_Recv_0_p1[(i4-2)];
}
for (; (i4<=131); i4 += 1) {
fieldData_LaplaceCoeff_7_p1[((i4*132)-262)] = buffer_Recv_0_p1[(i4-3)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_LaplaceCoeff[7][17030], 1, mpiDatatype_9_129_17292, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff[7][134], 1, mpiDatatype_9_129_17292, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S996 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][(i3*17292)]);
double* buffer_Send_0_p1 = (&buffer_Send[0][(i3*131)]);
int i4 = 0;
for (; (i4<=129); i4 += 2) {
buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_7_p1[((i4*132)+3)];
buffer_Send_0_p1[(i4+1)] = fieldData_LaplaceCoeff_7_p1[((i4*132)+135)];
}
for (; (i4<=130); i4 += 1) {
buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_7_p1[((i4*132)+3)];
}
}
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S997 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*131)]);
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][(i3*17292)]);
int i4 = 0;
for (; (i4<=129); i4 += 2) {
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_7_p1[((i4*132)+129)];
buffer_Send_1_p1[(i4+1)] = fieldData_LaplaceCoeff_7_p1[((i4*132)+261)];
}
for (; (i4<=130); i4 += 1) {
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_7_p1[((i4*132)+129)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Isend(buffer_Send[0], 1179, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(buffer_Send[1], 1179, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(buffer_Recv[0], 1179, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Irecv(buffer_Recv[1], 1179, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S998 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*131)]);
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][(i3*17292)]);
int i4 = 1;
for (; (i4<=130); i4 += 2) {
fieldData_LaplaceCoeff_7_p1[((i4*132)-131)] = buffer_Recv_0_p1[(i4-1)];
fieldData_LaplaceCoeff_7_p1[((i4*132)+1)] = buffer_Recv_0_p1[i4];
}
for (; (i4<=131); i4 += 1) {
fieldData_LaplaceCoeff_7_p1[((i4*132)-131)] = buffer_Recv_0_p1[(i4-1)];
}
}
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S999 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i3*131)]);
double* fieldData_LaplaceCoeff_7_p1 = (&fieldData_LaplaceCoeff[7][(i3*17292)]);
int i4 = 131;
for (; (i4<=260); i4 += 2) {
fieldData_LaplaceCoeff_7_p1[((i4*132)-17161)] = buffer_Recv_1_p1[(i4-131)];
fieldData_LaplaceCoeff_7_p1[((i4*132)-17029)] = buffer_Recv_1_p1[(i4-130)];
}
for (; (i4<=261); i4 += 1) {
fieldData_LaplaceCoeff_7_p1[((i4*132)-17161)] = buffer_Recv_1_p1[(i4-131)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Isend(&fieldData_LaplaceCoeff[7][265], 1, mpiDatatype_9_131_17292, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_LaplaceCoeff[7][16897], 1, mpiDatatype_9_131_17292, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff[7][1], 1, mpiDatatype_9_131_17292, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Irecv(&fieldData_LaplaceCoeff[7][17161], 1, mpiDatatype_9_131_17292, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
Example #6
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d three = (const __m128d) {3.0, 3.0};
	const __m128d half  = (const __m128d) {0.5, 0.5};
	
	__m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
	__m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
	
	/* First Newton-Raphson step, accuracy is now 24 bits */
	__m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
	
	/* Return second Newton-Raphson step, accuracy 48 bits */
	return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
}
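/* Each step above applies the Newton-Raphson recurrence for y = 1/sqrt(x):
       y' = 0.5 * y * (3 - x*y*y)
   which roughly doubles the number of correct bits, taking the ~12-bit
   _mm_rsqrt_ps estimate to ~24 and then ~48 bits. */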

/* to extract single integers from a __m128i datatype */
#define _mm_extract_epi64(x, imm) \
_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
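/* Note: despite the name, this macro extracts the 32-bit integer in lane
   `imm` (byte offset 4*imm) of a __m128i, matching the packed 32-bit indices
   produced by _mm_cvttpd_epi32 in the kernel below; it overrides the SSE4.1
   intrinsic of the same name. */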

void nb_kernel430_ia32_sse2(int *           p_nri,
							int *           iinr,
							int *           jindex,
							int *           jjnr,
							int *           shift,
							double *         shiftvec,
							double *         fshift,
							int *           gid,
							double *         pos,
							double *         faction,
							double *         charge,
							double *         p_facel,
							double *         p_krf,
							double *         p_crf,
							double *         Vc,
							int *           type,
							int *           p_ntype,
							double *         vdwparam,
							double *         Vvdw,
							double *         p_tabscale,
							double *         VFtab,
							double *         invsqrta,
							double *         dvda,
							double *         p_gbtabscale,
							double *         GBtab,
							int *           p_nthreads,
							int *           count,
							void *          mtx,
							int *           outeriter,
							int *           inneriter,
							double *         work)
{
	int           nri,ntype,nthreads,offset,tj,tj2,nti;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	float *        gpol;

	__m128d       ix,iy,iz,jx,jy,jz;
	__m128d		  dx,dy,dz,t1,t2,t3;
	__m128d		  fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d		  q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
	__m128d		  xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
	__m128d       fac,tabscale,gbtabscale;
	__m128i       n0,nnn;
	
	const __m128d neg    = {-1.0,-1.0};
	const __m128d zero   = {0.0,0.0};
	const __m128d half   = {0.5,0.5};
	const __m128d two    = {2.0,2.0};
	const __m128d three  = {3.0,3.0};
	const __m128d six    = {6.0,6.0};
	const __m128d twelve = {12.0,12.0};
	
	const __m128i four   = _mm_set_epi32(4,4,4,4);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;

	nri        = *p_nri;
	ntype      = *p_ntype;
	nthreads   = *p_nthreads; 
    facel      = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));       
	krf        = *p_krf;
	crf        = *p_crf;
	tabscl     = *p_tabscale;
	gbtabscl   = *p_gbtabscale;
	nj1        = 0;
	
	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);
		
	/* Keep compiler happy */
	Vvdwtmp = _mm_setzero_pd();
	Vvdwtot = _mm_setzero_pd();
	dvdatmp = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();
	xmm1    = _mm_setzero_pd();
	xmm2    = _mm_setzero_pd();
	xmm3    = _mm_setzero_pd();
	xmm4    = _mm_setzero_pd();
	jnr1    = jnr2 = 0;
	j13     = j23  = 0;
		
	for(n=0;n<nri;n++)
	{
		is3     = 3*shift[n];
		shX     = shiftvec[is3];
		shY     = shiftvec[is3+1];
		shZ     = shiftvec[is3+2];
		
		nj0     = jindex[n];      
        nj1     = jindex[n+1];  
		offset  = (nj1-nj0)%2;
		
		ii      = iinr[n];
		ii3     = ii*3;
		
		ix      = _mm_set1_pd(shX+pos[ii3+0]);
		iy      = _mm_set1_pd(shY+pos[ii3+1]);
		iz      = _mm_set1_pd(shZ+pos[ii3+2]);
		q       = _mm_set1_pd(charge[ii]);
		
		iq      = _mm_mul_pd(fac,q); 
		isai_d  = invsqrta[ii];
		isai    = _mm_load1_pd(&isai_d);
		
		nti      = 2*ntype*type[ii];
		
		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();
		Vvdwtot = _mm_setzero_pd();
		
		for(k=nj0;k<nj1-offset; k+=2)
		{
			jnr1    = jjnr[k];
			jnr2    = jjnr[k+1];
			
			j13     = jnr1 * 3;
			j23     = jnr2 * 3;
			
			/* Load coordinates */
			xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
			xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
			
			xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
			xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
			
			/* transpose */
			jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
			
			/* distances */
			dx      = _mm_sub_pd(ix,jx);
			dy		= _mm_sub_pd(iy,jy);
			dz		= _mm_sub_pd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
			
			/* Load invsqrta */
			isaj	= _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj	= _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);
			
			/* Load charges */
			q		= _mm_loadl_pd(q,charge+jnr1);
			q		= _mm_loadh_pd(q,charge+jnr2);
			qq		= _mm_mul_pd(iq,q);
			
			vcoul	= _mm_mul_pd(qq,rinv);
			fscal	= _mm_mul_pd(vcoul,rinv);
			qq		= _mm_mul_pd(isaprod,qq);
			qq		= _mm_mul_pd(qq,neg);
			gbscale	= _mm_mul_pd(isaprod,gbtabscale);
			
			/* Load VdW parameters */
			tj      = nti+2*type[jnr1];
			tj2     = nti+2*type[jnr2];
			
			xmm1      = _mm_loadu_pd(vdwparam+tj);
			xmm2     = _mm_loadu_pd(vdwparam+tj2);
			c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
						
			/* Load dvdaj */
			dvdaj	= _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj	= _mm_loadh_pd(dvdaj, dvda+jnr2);
			
			/* Calculate GB table index */
			r		= _mm_mul_pd(rsq11,rinv);
			rt		= _mm_mul_pd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_pd(rt,n0d);
			eps2	= _mm_mul_pd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G		= _mm_mul_pd(G,eps);
			H		= _mm_mul_pd(H,eps2);
			Fp		= _mm_add_pd(F,G);
			Fp		= _mm_add_pd(Fp,H);
			VV		= _mm_mul_pd(Fp,eps);
			VV		= _mm_add_pd(Y,VV);
			H		= _mm_mul_pd(two,H);
			FF		= _mm_add_pd(Fp,G);
			FF		= _mm_add_pd(FF,H);
			vgb		= _mm_mul_pd(qq,VV);
			fijC	= _mm_mul_pd(qq,FF);
			fijC	= _mm_mul_pd(fijC,gbscale);
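			/* The lookup above evaluates a cubic spline table: with the four
			   coefficients Y, F, G, H per table point,
			       VV = Y + eps*(F + G*eps + H*eps*eps)   (potential)
			       FF = F + 2*G*eps + 3*H*eps*eps         (derivative w.r.t. eps)
			   so a single table fetch yields both energy and force. */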
			
			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp	= _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum	= _mm_add_pd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_pd(dvdatmp,isaj);
			xmm1	= _mm_mul_pd(xmm1,isaj);
			dvdaj	= _mm_add_pd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);
			
			vctot	= _mm_add_pd(vctot,vcoul);
			vgbtot	= _mm_add_pd(vgbtot,vgb);
			
			/* Calculate VDW table index */
			rt      = _mm_mul_pd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_pd(rt,n0d);
			eps2    = _mm_mul_pd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
			/* Tabulated VdW interaction - dispersion */
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			Vvdw6   = _mm_mul_pd(c6,VV);
			fijD    = _mm_mul_pd(c6,FF);
			
			/* Tabulated VdW interaction - repulsion */
			nnn     = _mm_add_epi32(nnn,four);
			
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			Vvdw12  = _mm_mul_pd(c12,VV);
			fijR    = _mm_mul_pd(c12,FF);
			
			Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
			
			xmm1    = _mm_add_pd(fijD,fijR);
			xmm1    = _mm_mul_pd(xmm1,tabscale);
			xmm1    = _mm_add_pd(xmm1,fijC);
			xmm1    = _mm_sub_pd(xmm1,fscal);
			fscal   = _mm_mul_pd(xmm1,neg);
			fscal   = _mm_mul_pd(fscal,rinv);
						
			/* calculate partial force terms */
			t1		= _mm_mul_pd(fscal,dx);
			t2		= _mm_mul_pd(fscal,dy);
			t3		= _mm_mul_pd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_pd(fix,t1);
			fiy		= _mm_add_pd(fiy,t2);
			fiz		= _mm_add_pd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm1	= _mm_loadu_pd(faction+j13); /* fx1 fy1 */
			xmm2	= _mm_loadu_pd(faction+j23); /* fx2 fy2 */
			
			xmm5	= _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6	= _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
			
			/* transpose */
			xmm7	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_pd(xmm5,t1);
			xmm6	= _mm_sub_pd(xmm6,t2);
			xmm7	= _mm_sub_pd(xmm7,t3);
			
			xmm1	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */
			
			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);
			
			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1	= jjnr[k];
			j13		= jnr1*3;
			
			jx      = _mm_load_sd(pos+j13);
			jy      = _mm_load_sd(pos+j13+1);
			jz      = _mm_load_sd(pos+j13+2);
			
			isaj	= _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj	= _mm_load_sd(dvda+jnr1);
			q		= _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);
			
			dx      = _mm_sub_sd(ix,jx);
			dy		= _mm_sub_sd(iy,jy);
			dz		= _mm_sub_sd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
			
			vcoul	= _mm_mul_sd(qq,rinv);
			fscal	= _mm_mul_sd(vcoul,rinv);
			qq		= _mm_mul_sd(isaprod,qq);
			qq		= _mm_mul_sd(qq,neg);
			gbscale	= _mm_mul_sd(isaprod,gbtabscale);
			
			/* Load VdW parameters */
			tj      = nti+2*type[jnr1];
			
			c6      = _mm_load_sd(vdwparam+tj);
			c12     = _mm_load_sd(vdwparam+tj+1);
					
			/* Calculate GB table index */
			r		= _mm_mul_sd(rsq11,rinv);
			rt		= _mm_mul_sd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_sd(rt,n0d);
			eps2	= _mm_mul_sd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); 
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); 
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); 
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); 
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
			
			G		= _mm_mul_sd(G,eps);
			H		= _mm_mul_sd(H,eps2);
			Fp		= _mm_add_sd(F,G);
			Fp		= _mm_add_sd(Fp,H);
			VV		= _mm_mul_sd(Fp,eps);
			VV		= _mm_add_sd(Y,VV);
			H		= _mm_mul_sd(two,H);
			FF		= _mm_add_sd(Fp,G);
			FF		= _mm_add_sd(FF,H);
			vgb		= _mm_mul_sd(qq,VV);
			fijC	= _mm_mul_sd(qq,FF);
			fijC	= _mm_mul_sd(fijC,gbscale);
			
			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp	= _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum	= _mm_add_sd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_sd(dvdatmp,isaj);
			xmm1	= _mm_mul_sd(xmm1,isaj);
			dvdaj	= _mm_add_sd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			
			vctot	= _mm_add_sd(vctot,vcoul);
			vgbtot	= _mm_add_sd(vgbtot,vgb);
			
			/* Calculate VDW table index */
			rt      = _mm_mul_sd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_sd(rt,n0d);
			eps2    = _mm_mul_sd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
			/* Tabulated VdW interaction - dispersion */
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			Vvdw6   = _mm_mul_sd(c6,VV);
			fijD    = _mm_mul_sd(c6,FF);
			
			/* Tabulated VdW interaction - repulsion */
			nnn     = _mm_add_epi32(nnn,four);
			
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			Vvdw12  = _mm_mul_sd(c12,VV);
			fijR    = _mm_mul_sd(c12,FF);
			
			Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
			
			xmm1    = _mm_add_sd(fijD,fijR);
			xmm1    = _mm_mul_sd(xmm1,tabscale);
			xmm1    = _mm_add_sd(xmm1,fijC);
			xmm1    = _mm_sub_sd(xmm1,fscal);
			fscal   = _mm_mul_sd(xmm1,neg);
			fscal   = _mm_mul_sd(fscal,rinv);
			
			/* calculate partial force terms */
			t1		= _mm_mul_sd(fscal,dx);
			t2		= _mm_mul_sd(fscal,dy);
			t3		= _mm_mul_sd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_sd(fix,t1);
			fiy		= _mm_add_sd(fiy,t2);
			fiz		= _mm_add_sd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm5	= _mm_load_sd(faction+j13);   /* fx */
			xmm6    = _mm_load_sd(faction+j13+1); /* fy */
			xmm7    = _mm_load_sd(faction+j13+2); /* fz */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_sd(xmm5,t1);
			xmm6	= _mm_sub_sd(xmm6,t2);
			xmm7	= _mm_sub_sd(xmm7,t3);
			
			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}
		
		/* fix/fiy/fiz each hold two partial sums (one per SSE lane) that
		 * should both be added to the i particle force
		 */
		t1		 = _mm_unpacklo_pd(t1,fix);
		t2		 = _mm_unpacklo_pd(t2,fiy);
		t3		 = _mm_unpacklo_pd(t3,fiz);
		
		fix		 = _mm_add_pd(fix,t1);
		fiy		 = _mm_add_pd(fiy,t2);
		fiz		 = _mm_add_pd(fiz,t3);
		
		fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
		
		/* Load i forces from memory */
		xmm1     = _mm_load_sd(faction+ii3);
		xmm2     = _mm_load_sd(faction+ii3+1);
		xmm3     = _mm_load_sd(faction+ii3+2);
		
		/* Add to i force */
		fix      = _mm_add_sd(fix,xmm1);
		fiy      = _mm_add_sd(fiy,xmm2);
		fiz      = _mm_add_sd(fiz,xmm3);
		
		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);
		
		/* now do dvda */
		dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
		
		ggid	 = gid[n];
		
		/* Coulomb potential */
		vcoul	 = _mm_unpacklo_pd(vcoul,vctot);
		vctot	 = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;
		
		/* VdW potential */
		Vvdwtmp	 = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
		Vvdwtot	 = _mm_add_pd(Vvdwtot,Vvdwtmp);
		_mm_storeh_pd(&vdwt,Vvdwtot);
		Vvdw[ggid] = Vvdw[ggid] + vdwt;
		
		/* GB potential */
		vgb  	 = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot	 = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}
	
	*outeriter   = nri;
	*inneriter   = nj1;
}
Example #7
bool CResizeEngine::horizontalFilter(CDIBSection *src, uint src_height,
                                     CDIBSection *dst, uint dst_yoffset, uint dst_height,
                                     ILongTimeRunCallback *pCallback) {
	assert(src->getBitCounts() == dst->getBitCounts());
	int bitcount = src->getBitCounts();
	assert((int)src_height <= src->getHeight());
	assert(src_height >= dst_height);
	uint dst_ymax = dst_yoffset + dst_height;
	assert((int)dst_ymax <= dst->getHeight());
	uint src_width = src->getWidth();
	uint dst_width = dst->getWidth();

	if (dst_width == src_width) {

		uint8 *src_bits = src->getData();
		uint8 *dst_bits = dst->getLine(dst_yoffset);
		assert(src_bits && dst_bits);

		uint height = min(dst_height, src_height);
		memcpy(dst_bits, src_bits, height * dst->getStride());

	} else if (!m_pFilter) { // fast (COLORONCOLOR)
		double ratio_w = (double)src_width / (double)dst_width;
		uint bytespp = bitcount / 8;

		for (uint y = dst_yoffset, sy = 0; y < dst_ymax; ++ y, ++ sy) {
			uint8 *dst_data = (uint8 *)dst->getLine(y);
			uint8 *src_line = (uint8 *)src->getLine(sy);

			for (uint x = 0; x < dst_width; ++ x) {
				uint sx = (uint)(x * ratio_w + 0.5);
				if (sx >= src_width) {
					sx = src_width - 1;
				}

				uint8 *src_data = src_line + sx * bytespp;
				for (uint i = 0; i < bytespp; ++ i) {
					*dst_data ++ = *src_data ++;
				}
			}
		}

	} else { // use m_pFilter
		uint index; // pixel index
		CWeightsTable weightsTable(m_pFilter, dst_width, src_width);
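		// Each destination pixel x maps to a source window [iLeft, iRight];
		// weightsTable precomputes the filter weights for every window, so the
		// loops below reduce to a weighted per-channel sum over that window.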
#ifdef USE_SSE
		__m128i value, t;
		__m128 a, b, c, v05 = _mm_set_ps1(0.5);
#elif (defined(USE_SSE2))
		__m128i value, t;
		__m128d a, b, c, v05 = _mm_set1_pd(0.5);
#endif

		uint bytespp = src->getBitCounts() / 8;
		assert(bytespp == 3 || bytespp == 4);
		for (uint dsty = dst_yoffset, srcy = 0; dsty < dst_ymax; ++ dsty, ++ srcy) {
			// test for stop
			if (srcy % 32 == 0) {
				if (pCallback && pCallback->shouldStop()) {
					return false;
				}
			}

			uint8 *src_bits = src->getLine(srcy);
			uint8 *dst_bits = dst->getLine(dsty);

			for(uint x = 0; x < dst_width; ++ x) {
				int iLeft = weightsTable.getLeftBoundary(x);
				int iRight = weightsTable.getRightBoundary(x);
				index = iLeft * bytespp;
#ifdef USE_SSE
				__m128 v = _mm_set_ps1(0.0);
				_mm_prefetch((const char *)src_bits + index, _MM_HINT_T0);
#elif defined(USE_SSE2)
				__m128d v1 = _mm_set1_pd(0.0);
				__m128d v2 = _mm_set1_pd(0.0);
#elif defined(USE_FLOAT)
				float value[4] = {0, 0, 0, 0};
#else
				double value[4] = {0, 0, 0, 0}; // 4 = 32bpp max
#endif
				for(int i = iLeft; i <= iRight; ++ i) {
#ifdef USE_SSE
					float weight = (float)weightsTable.getWeight(x, i - iLeft);

					a = _mm_set_ps1(weight);
					if (bytespp == 3) {
						t = _mm_set_epi32(0, src_bits[index + 2], src_bits[index + 1], src_bits[index]);
					} else {
						t = _mm_set_epi32(src_bits[index + 3], src_bits[index + 2], src_bits[index + 1], src_bits[index]);
					}
					b = _mm_cvtepi32_ps(t);
					c = _mm_mul_ps(a, b);
					v = _mm_add_ps(v, c);
					index += bytespp;
#elif defined(USE_SSE2)
					double weight = weightsTable.getWeight(x, i-iLeft);

					a = _mm_set1_pd(weight);
					t = _mm_set_epi32(0, 0, src_bits[index + 1], src_bits[index]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v1 = _mm_add_pd(v1, c);

					t = _mm_set_epi32(0, 0, bytespp == 3 ? 0 : src_bits[index + 3], src_bits[index + 2]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v2 = _mm_add_pd(v2, c);
					index += bytespp;
#elif defined(USE_FLOAT)
					float weight = (float)weightsTable.getWeight(x, i-iLeft);

					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (float)src_bits[index ++]); 
					}
#else
					double weight = weightsTable.getWeight(x, i-iLeft);

					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (double)src_bits[index ++]); 
					}
#endif
				} 

#ifdef USE_SSE
				v = _mm_add_ps(v, v05);
				value = _mm_cvtps_epi32(v);
				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[2]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[3]), (int)255);
				}
#elif defined (USE_SSE2)
				v1 = _mm_add_pd(v1, v05);
				v2 = _mm_add_pd(v2, v05);
				value = _mm_cvtpd_epi32(v1);
				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				value = _mm_cvtpd_epi32(v2);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				}
#else
				for (uint j = 0; j < bytespp; ++ j) {
					dst_bits[j] = (unsigned char)MIN(MAX((int)0, (int)(value[j] + 0.5)), (int)255);
				}
#endif

				dst_bits += bytespp;
			}
		}
	}
	return true;
}
Example #8
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS

	const int n            = C.numberOfNodes();

#ifdef _OPENMP
	const int work = 256;
	const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
	const int nThreadsPrev = min(omp_get_max_threads(), 1 + n  /work);
#endif

	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	const double c_rep   = 0.052 * kSquare; // factor for repulsive forces (Warshal suggests 0.2)

	const double minDist       = 10e-6; // 100*DBL_EPSILON
	const double minDistSquare = minDist*minDist;

	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	__m128d mm_kSquare       = _mm_set1_pd(kSquare);
	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces

			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				int u;
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				int uStart = u+2;
				if(u == v) ++u;
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				for(u = uStart; u < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces

			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)

			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				_mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));

					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			cool(tx,ty,cF);

			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}
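
The repulsive-force loop above keeps per-lane partial sums in __m128d accumulators and collapses them with _mm_hadd_pd before scaling by c_rep. A minimal sketch of that SSE3 horizontal reduction:

#include <pmmintrin.h> /* SSE3 */

static double hsum_pd(__m128d v)
{
    /* _mm_hadd_pd(v, v) leaves v[0] + v[1] in both lanes;
       _mm_cvtsd_f64 extracts the low lane */
    return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
}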
Example #9
File: center.c  Project: gkiss/mdtraj
void inplace_center_and_trace_atom_major(float* coords, float* traces, const int n_frames, const int n_atoms)
{
    /* Center a trajectory containing multiple conformations in place.
       The coordinates are stored in float, but the accumulation is done in
       double.

       Also compute the traces of the centered conformations, which are
       necessary for RMSD.
    */
    int i, k;
    float* confp;
    __m128d sx_, sy_, sz_, trace_;
    __m128 mux_, muy_, muz_;
    float sxf, syf, szf;
    double sx[2], sy[2], sz[2], trace[2];
    __m128 x, y, z, x2, y2, z2;

    #ifdef _OPENMP
    #pragma omp parallel for default(none) shared(coords, traces) \
        private(sx_, sy_, sz_, trace_, mux_, muy_, muz_, sxf, syf, szf, \
        confp, i, x, y, z, x2, y2, z2, sx, sy, sz, trace)
    #endif
    for (k = 0; k < n_frames; k++) {
        confp = &coords[k * n_atoms * 3];
        sx_ = sy_ = sz_ = trace_ = _mm_setzero_pd();
        for (i = 0; i < n_atoms/4; i++) {
            aos_deinterleaved_loadu(confp, &x, &y, &z);

            // accumulate the sums of each coordinate in double
            // get the first two values from each float4
            sx_ = _mm_add_pd(sx_, _mm_cvtps_pd(x));
            sy_ = _mm_add_pd(sy_, _mm_cvtps_pd(y));
            sz_ = _mm_add_pd(sz_, _mm_cvtps_pd(z));
            // and shuffle in the second two values
            sx_ = _mm_add_pd(sx_, _mm_cvtps_pd(_mm_movehl_ps(x, x)));
            sy_ = _mm_add_pd(sy_, _mm_cvtps_pd(_mm_movehl_ps(y, y)));
            sz_ = _mm_add_pd(sz_, _mm_cvtps_pd(_mm_movehl_ps(z, z)));
            confp += 12;
        }
        // copy the summed coordinates out of the SSE registers
        _mm_storeu_pd(sx, sx_);
        _mm_storeu_pd(sy, sy_);
        _mm_storeu_pd(sz, sz_);

        // Add the remaining (n_atoms % 4) entries that the vector loop didn't cover
        for (i = 0; i < n_atoms % 4; i++) {
            sx[0] += confp[i*3 + 0];
            sy[0] += confp[i*3 + 1];
            sz[0] += confp[i*3 + 2];
        }

        // Put everything into the first value. We're doing this here, as
        // opposed to using an SSE horizontal add.
        sx[0] += sx[1];
        sy[0] += sy[1];
        sz[0] += sz[1];

        // Now we want mean x, y, and z positions
        sx[0] /= n_atoms;
        sy[0] /= n_atoms;
        sz[0] /= n_atoms;

        // Load these mean positions back into the SSE registers
        sxf = (float) sx[0];
        syf = (float) sy[0];
        szf = (float) sz[0];
        mux_ = _mm_load1_ps(&sxf);
        muy_ = _mm_load1_ps(&syf);
        muz_ = _mm_load1_ps(&szf);

        // And subtract them out
        confp = &coords[k * n_atoms * 3];
        for (i = 0; i < n_atoms/4; i++) {
            aos_deinterleaved_loadu(confp, &x, &y, &z);
            x = _mm_sub_ps(x, mux_);
            y = _mm_sub_ps(y, muy_);
            z = _mm_sub_ps(z, muz_);

            x2 = _mm_mul_ps(x, x);
            y2 = _mm_mul_ps(y, y);
            z2 = _mm_mul_ps(z, z);
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(x2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(y2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(z2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(x2, x2)));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(y2, y2)));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(z2, z2)));

            aos_interleaved_storeu(confp, x, y, z);
            confp += 12;
        }
        _mm_storeu_pd(trace, trace_);

        for (i = 0; i < n_atoms % 4; i++) {
            confp[i*3 + 0] -= sxf;
            confp[i*3 + 1] -= syf;
            confp[i*3 + 2] -= szf;
            trace[0] += confp[i*3 + 0]*confp[i*3 + 0];
            trace[0] += confp[i*3 + 1]*confp[i*3 + 1];
            trace[0] += confp[i*3 + 2]*confp[i*3 + 2];
        }
        trace[0] += trace[1];
        if (traces != NULL)
            traces[k] = (float) trace[0];
    }
}
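
The accumulation above widens each __m128 of floats into two __m128d halves before summing, so the running totals keep double precision even for long frames. A minimal sketch of that widen-and-accumulate step, assuming only SSE2:

#include <emmintrin.h>

static __m128d accumulate_wide(__m128d acc, __m128 v)
{
    /* widen the low two floats to doubles and add */
    acc = _mm_add_pd(acc, _mm_cvtps_pd(v));
    /* move the high two floats into the low lanes, widen, add */
    acc = _mm_add_pd(acc, _mm_cvtps_pd(_mm_movehl_ps(v, v)));
    return acc;
}

Example #10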
void nb_kernel410_sse2_double(int *           p_nri,
							int *           iinr,
							int *           jindex,
							int *           jjnr,
							int *           shift,
							double *         shiftvec,
							double *         fshift,
							int *           gid,
							double *         pos,
							double *         faction,
							double *         charge,
							double *         p_facel,
							double *         p_krf,
							double *         p_crf,
							double *         vc,
							int *           type,
							int *           p_ntype,
							double *         vdwparam,
							double *         vvdw,
							double *         p_tabscale,
							double *         VFtab,
							double *         invsqrta,
							double *         dvda,
							double *         p_gbtabscale,
							double *         GBtab,
							int *           p_nthreads,
							int *           count,
							void *          mtx,
							int *           outeriter,
							int *           inneriter,
							double *         work)
{
  int           nri,ntype,nthreads;
  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
  double        shX,shY,shZ;
	int			  offset,nti;
  int           jnrA,jnrB;
  int           j3A,j3B;
	int           tjA,tjB;
	gmx_gbdata_t *gbdata;
	double *      gpol;
    
	__m128d  iq,qq,jq,isai;
	__m128d  ix,iy,iz;
	__m128d  jx,jy,jz;
	__m128d  dx,dy,dz;
	__m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
	__m128d  fix,fiy,fiz,tx,ty,tz,rsq;
	__m128d  rinv,isaj,isaprod;
	__m128d  vcoul,fscal,gbscale,c6,c12;
	__m128d  rinvsq,r,rtab;
	__m128d  eps,Y,F,G,H;
	__m128d  vgb,fijGB,dvdatmp;
	__m128d  rinvsix,vvdw6,vvdw12;
	__m128d  facel,gbtabscale,dvdaj;
	__m128i  n0, nnn;
	
	const __m128d neg        = _mm_set1_pd(-1.0);
	const __m128d zero       = _mm_set1_pd(0.0);
	const __m128d minushalf  = _mm_set1_pd(-0.5);
	const __m128d two        = _mm_set1_pd(2.0);
	const __m128d six        = _mm_set1_pd(6.0);
	const __m128d twelve     = _mm_set1_pd(12.0);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;

	nri        = *p_nri;
	ntype      = *p_ntype;
    
  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
  gbtabscale = _mm_load1_pd(p_gbtabscale);  
  facel      = _mm_load1_pd(p_facel);
  
  nj1         = 0;
  jnrA = jnrB = 0;
  j3A = j3B   = 0;
  jx          = _mm_setzero_pd();
  jy          = _mm_setzero_pd();
  jz          = _mm_setzero_pd();
  c6          = _mm_setzero_pd();
  c12         = _mm_setzero_pd();
	
	for(n=0;n<nri;n++)
	{
    is3              = 3*shift[n];     
    shX              = shiftvec[is3];  
    shY              = shiftvec[is3+1];
    shZ              = shiftvec[is3+2];
    nj0              = jindex[n];      
    nj1              = jindex[n+1];    
    ii               = iinr[n];        
    ii3              = 3*ii;           
		
		ix               = _mm_set1_pd(shX+pos[ii3+0]);
		iy               = _mm_set1_pd(shY+pos[ii3+1]);
		iz               = _mm_set1_pd(shZ+pos[ii3+2]);
    
		iq               = _mm_load1_pd(charge+ii);
		iq               = _mm_mul_pd(iq,facel);
    
		isai             = _mm_load1_pd(invsqrta+ii);
        
		nti              = 2*ntype*type[ii];
		
		vctot            = _mm_setzero_pd();
		vvdwtot          = _mm_setzero_pd();
		vgbtot           = _mm_setzero_pd();
		dvdasum          = _mm_setzero_pd();
		fix              = _mm_setzero_pd();
		fiy              = _mm_setzero_pd();
		fiz              = _mm_setzero_pd();
        
		for(k=nj0;k<nj1-1; k+=2)
		{
			jnrA    = jjnr[k];
			jnrB    = jjnr[k+1];
			
			j3A     = jnrA * 3;
			j3B     = jnrB * 3;

      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);

			dx           = _mm_sub_pd(ix,jx);
			dy           = _mm_sub_pd(iy,jy);
			dz           = _mm_sub_pd(iz,jz);

      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
      
      rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_pd(rinv,rinv);
      
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
			GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
            
      /* Lennard-Jones */
      tjA          = nti+2*type[jnrA];
			tjB          = nti+2*type[jnrB];
      
      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
			
			isaprod      = _mm_mul_pd(isai,isaj);
			qq           = _mm_mul_pd(iq,jq);            
			vcoul        = _mm_mul_pd(qq,rinv);
			fscal        = _mm_mul_pd(vcoul,rinv);                                 
      vctot        = _mm_add_pd(vctot,vcoul);
            
      /* Polarization interaction */
			qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
			gbscale      = _mm_mul_pd(isaprod,gbtabscale);
      
 			/* Calculate GB table index */
			r            = _mm_mul_pd(rsq,rinv);
			rtab         = _mm_mul_pd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
      /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_pd(G,eps);
      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
      vgb     = _mm_mul_pd(Y, qq);           
      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
      
      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
      
      vgbtot  = _mm_add_pd(vgbtot, vgb);
      
      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
      
      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
			
			rinvsix      = _mm_mul_pd(rinvsq,rinvsq);
			rinvsix      = _mm_mul_pd(rinvsix,rinvsq);
			
			vvdw6        = _mm_mul_pd(c6,rinvsix);
			vvdw12       = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
			vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
            
      fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
                                           _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
                                                      _mm_mul_pd(six,vvdw6))),
                                _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
      
      /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
      
      /* Calculate temporary vectorial force */
      tx           = _mm_mul_pd(fscal,dx);
      ty           = _mm_mul_pd(fscal,dy);
      tz           = _mm_mul_pd(fscal,dz);
      
      /* Increment i atom force */
      fix          = _mm_add_pd(fix,tx);
      fiy          = _mm_add_pd(fiy,ty);
      fiz          = _mm_add_pd(fiz,tz);
      
      /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(k<nj1)
		{
			jnrA    = jjnr[k];
			
			j3A     = jnrA * 3;
      
      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
            
			dx           = _mm_sub_sd(ix,jx);
			dy           = _mm_sub_sd(iy,jy);
			dz           = _mm_sub_sd(iz,jz);
            
      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
      
      rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_sd(rinv,rinv);
      
      /* The reason for zeroing these variables here is to fix bug 585.
       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0]
       * and r1=0, but it should be r1=a[1].
       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
       * Note that the only variables that get affected are the energies, since
       * the total sum needs to be correct.
       */
      vgb          = _mm_setzero_pd();
      vcoul        = _mm_setzero_pd();
      dvdatmp      = _mm_setzero_pd();
      vvdw6        = _mm_setzero_pd();
      vvdw12       = _mm_setzero_pd();
      
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
			GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
      
      /* Lennard-Jones */
      tjA          = nti+2*type[jnrA];
      
      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
			
			isaprod      = _mm_mul_sd(isai,isaj);
			qq           = _mm_mul_sd(jq,iq);            
			vcoul        = _mm_mul_sd(qq,rinv);
			fscal        = _mm_mul_sd(vcoul,rinv);                                 
      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
      
      /* Polarization interaction */
			qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
			gbscale      = _mm_mul_sd(isaprod,gbtabscale);
      
 			/* Calculate GB table index */
			r            = _mm_mul_sd(rsq,rinv);
			rtab         = _mm_mul_sd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
      /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_sd(G,eps);
      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
      vgb     = _mm_mul_sd(Y, qq);           
      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
      
      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
      
      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
      
      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
      
      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
			
			rinvsix      = _mm_mul_sd(rinvsq,rinvsq);
			rinvsix      = _mm_mul_sd(rinvsix,rinvsq);
			
			vvdw6        = _mm_mul_sd(c6,rinvsix);
			vvdw12       = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
			vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
      
      fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
                                           _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
                                                      _mm_mul_sd(six,vvdw6))),
                                _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
      
      /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
      
      /* Calculate temporary vectorial force */
      tx           = _mm_mul_sd(fscal,dx);
      ty           = _mm_mul_sd(fscal,dy);
      tz           = _mm_mul_sd(fscal,dz);
      
      /* Increment i atom force */
      fix          = _mm_add_sd(fix,tx);
      fiy          = _mm_add_sd(fiy,ty);
      fiz          = _mm_add_sd(fiz,tz);
      
      /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
		}
		
    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
    
    ggid     = gid[n];         
    
    gmx_mm_update_1pot_pd(vctot,vc+ggid);
    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
    
	}
  
	*outeriter   = nri;            
  *inneriter   = nj1; 	
}
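
The table section above is a cubic-spline lookup: each table point stores four coefficients Y, F, G, H, the potential is VV = Y + eps*F + eps^2*G + eps^3*H, and the force factor comes from the derivative FF = F + 2*eps*G + 3*eps^2*H, which is exactly what the G *= eps, H *= eps^2 and two-stage F update compute. A scalar sketch of the same evaluation:

static void cubic_table_eval(const double tab[4], double eps,
                             double *vv, double *ff)
{
    const double Y = tab[0], F = tab[1], G = tab[2], H = tab[3];
    *vv = Y + eps * (F + eps * (G + eps * H)); /* potential */
    *ff = F + eps * (2.0 * G + 3.0 * eps * H); /* d(vv)/d(eps) */
}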
Example #11
AABB3d TriangleItemHandler::clip(
    const size_t                        item_index,
    const size_t                        dimension,
    const double                        slab_min,
    const double                        slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];

        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;

        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;

        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    if (v0v1_cross_min | v0v1_cross_max)
    {
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    APPLESEED_SIMD4_ALIGN AABB3d bbox;

    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;

    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);

    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);

    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));

    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));

    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));

    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));

    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));

    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
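
Each slab-crossing edge above is clipped by linear interpolation: with t = (slab - a[dim]) / (b[dim] - a[dim]), the crossing point is (1 - t)*a + t*b, which the mt/mt1 multiplies evaluate two coordinates at a time. A scalar sketch of that interpolation (the actual segment_plane_intersection in appleseed may be organized differently):

static void segment_plane_point(const double a[3], const double b[3],
                                int dim, double slab, double out[3])
{
    /* parameter t where a[dim] + t * (b[dim] - a[dim]) == slab */
    const double t = (slab - a[dim]) / (b[dim] - a[dim]);
    int i;
    for (i = 0; i < 3; ++i)
        out[i] = (1.0 - t) * a[i] + t * b[i];
}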
Example #12
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;
#ifdef USE_MANUAL_CALLSTACK
  call_stack.enter("sse3_ChirpData_ak()");
#endif 

  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif 
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(srate);
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                    SS3),
                                y),
                          SS2),
                    y),
              SS1),
          x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                CC2),
                          y),
                    CC1),
              y),
          ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  // handle tail elements with scalar code
  for (   ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i;  // angle in turns, matching the vectorised loop
    angle -= floor(angle);         // reduce to [0, 1)
    angle *= M_PI * 2.0;           // convert to radians
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];

    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

#ifdef USE_MANUAL_CALLSTACK
  call_stack.exit();
#endif 
  return 0;
}
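
The range reduction above relies on the add/subtract-2^52 trick: in the default round-to-nearest mode, (a + 2^52) - 2^52 yields the integer nearest to a for |a| < 2^51 (with the constant's sign matching a), so subtracting that integer leaves a fraction in [-0.5, 0.5]. A scalar sketch, assuming TWO_TO_52 = 4503599627370496.0; the kernel picks the constant's sign once from srate because every angle shares it:

static double reduce_to_half_turn(double a)
{
    const double TWO_TO_52 = 4503599627370496.0; /* 2^52 */
    const double roundVal = (a >= 0.0) ? TWO_TO_52 : -TWO_TO_52;
    const double nearest = (a + roundVal) - roundVal; /* nearest integer to a */
    return a - nearest; /* fraction in [-0.5, 0.5] */
}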
Example #13
int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int  ul_NumDataPoints,
    double sample_rate
) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif 
    int i;

    if (chirp_rate_ind == 0) {
      memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
#ifdef USE_MANUAL_CALLSTACK
      call_stack.exit();
#endif 
      return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(srate);
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set_pd(4.0, 4.0);


    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0);                 // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
      const float *d = (const float *) (cx_DataArray + i);
      float *cd = (float *) (cx_ChirpDataArray + i);

      __m128d a1, a2;

      __m128 d1, d2;
      __m128 cd1, cd2;
      __m128 td1, td2;

      __m128 x;
      __m128 y;
      __m128 z;
      __m128 s;
      __m128 c;
      __m128 m;

      // load the signal to be chirped
      d1 = _mm_load_ps(d);
      d2 = _mm_load_ps(d+4);

      // calculate the input angle
      a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
      a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

      // update times for next
      di1 = _mm_add_pd(di1, DFOUR);
      di2 = _mm_add_pd(di2, DFOUR);

      // reduce the angle to the range (-0.5, 0.5)
      a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
      a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

      // convert pair of packed double into packed single
      x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));               // 3   1   2   0

      // square to the range [0, 0.25)
      y = _mm_mul_ps(x, x);

      // perform the initial polynomial approximations, Estrin's method
      z = _mm_mul_ps(y, y);

      s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F),
                                                      SS3F),
                                           z),
                                _mm_add_ps(_mm_mul_ps(y, SS2F),
                                           SS1F)),
                     x);
      c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F),
                                           CC2F),
                                z),
                     _mm_add_ps(_mm_mul_ps(y, CC1F),
                                ONE));

      // perform first angle doubling
      x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
      y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

      // calculate scaling factor to correct the magnitude
      m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

      // perform second angle doubling
      c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
      s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

      // correct the magnitude (final sine / cosine approximations)
      c = _mm_mul_ps(c, m);                                       // c3    c1    c2    c0
      s = _mm_mul_ps(s, m);

      // chirp the data
      cd1 = _mm_moveldup_ps(c);                                   // c1    c1    c0    c0
      cd2 = _mm_movehdup_ps(c);                                   // c3    c3    c2    c2
      cd1 = _mm_mul_ps(cd1, d1);                                  // c1.i1 c1.r1 c0.i0 c0.r0
      cd2 = _mm_mul_ps(cd2, d2);                                  // c3.i3 c3.r3 c2.i2 c2.r2
      d1 = _mm_shuffle_ps(d1, d1, 0xb1);
      d2 = _mm_shuffle_ps(d2, d2, 0xb1);
      td1 = _mm_moveldup_ps(s);
      td2 = _mm_movehdup_ps(s);
      td1 = _mm_mul_ps(td1, d1);
      td2 = _mm_mul_ps(td2, d2);
      cd1 = _mm_addsub_ps(cd1, td1);
      cd2 = _mm_addsub_ps(cd2, td2);

      // store chirped values
      _mm_stream_ps(cd, cd1);
      _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
      double angle = srate * i * i;  // angle in turns, matching the vectorised loop
      angle -= floor(angle);         // reduce to [0, 1)
      angle *= M_PI * 2.0;           // convert to radians
      double s = sin(angle);
      double c = cos(angle);

      float re = cx_DataArray[i][0];
      float im = cx_DataArray[i][1];

      cx_ChirpDataArray[i][0] = re * c - im * s;
      cx_ChirpDataArray[i][1] = re * s + im * c;
    }
    analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif 
    return 0;
}
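
The ak8 variant evaluates its sine/cosine polynomials with Estrin's scheme: precompute z = y*y, then combine (a0 + a1*y) + z*(a2 + a3*y), so the two halves form independent instruction chains that can execute in parallel, unlike Horner's fully serial recurrence. A scalar sketch for a degree-3 polynomial:

static double estrin_deg3(double y, double a0, double a1,
                          double a2, double a3)
{
    const double z = y * y;
    /* the two parenthesised terms do not depend on each other */
    return (a0 + a1 * y) + z * (a2 + a3 * y);
}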
Example #14
/* mix local carrier -----------------------------------------------------------
* mix local carrier to data
* args   : char   *data     I   data
*          int    dtype     I   data type (0:real,1:complex)
*          double ti        I   sampling interval (s)
*          int    n         I   number of samples
*          double freq      I   carrier frequency (Hz)
*          double phi0      I   initial phase (rad)
*          short  *I,*Q     O   carrier mixed data I, Q component
* return : double               phase remainder
*-----------------------------------------------------------------------------*/
extern double mixcarr(const char *data, int dtype, double ti, int n, 
                      double freq, double phi0, short *II, short *QQ)
{
    const char *p;
    double phi,ps,prem;

#if !defined(SSE2_ENABLE)
    static short cost[CDIV]={0},sint[CDIV]={0};
    int i,index;

    /* initialize local carrier table */
    if (!cost[0]) { 
        for (i=0;i<CDIV;i++) {
            cost[i]=(short)floor((cos(DPI/CDIV*i)/CSCALE+0.5));
            sint[i]=(short)floor((sin(DPI/CDIV*i)/CSCALE+0.5));
        }
    }
    phi=phi0*CDIV/DPI;
    ps=freq*CDIV*ti; /* phase step */

    if (dtype==DTYPEIQ) { /* complex */
        for (p=data;p<data+n*2;p+=2,II++,QQ++,phi+=ps) {
            index=((int)phi)&CMASK;
            *II=cost[index]*p[0]-sint[index]*p[1];
            *QQ=sint[index]*p[0]+cost[index]*p[1];
        }
    }
    if (dtype==DTYPEI) { /* real */
        for (p=data;p<data+n;p++,II++,QQ++,phi+=ps) {
            index=((int)phi)&CMASK;
            *II=cost[index]*p[0];
            *QQ=sint[index]*p[0];
        }
    }
    prem=phi*DPI/CDIV;
    while(prem>DPI) prem-=DPI;
    return prem;
#else
    static char cost[16]={0},sint[16]={0};
    short I1[16]={0},I2[16]={0},Q1[16]={0},Q2[16]={0};
    int i;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm9;
    __m128i dat1,dat2,dat3,dat4,ind1,ind2,xcos,xsin;
    __m128i zero=_mm_setzero_si128();
    __m128i mask4=_mm_set1_epi32(15);
    __m128i mask8=_mm_set1_epi16(255);

    if (!cost[0]) {
        for (i=0;i<16;i++) {
            cost[i]=(char)floor((cos(DPI/16*i)/CSCALE+0.5));
            sint[i]=(char)floor((sin(DPI/16*i)/CSCALE+0.5));
        }
    }
    phi=phi0/DPI*16-floor(phi0/DPI)*16;
    ps=freq*16*ti;
    xmm1=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm2=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm3=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm4=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm5=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm6=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm7=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm8=_mm_set_pd(phi+ps,phi); phi+=ps*2;
    xmm9=_mm_set1_pd(ps*16);
    xcos=_mm_loadu_si128((__m128i *)cost);
    xsin=_mm_loadu_si128((__m128i *)sint);

    if (dtype==DTYPEIQ) { /* complex */
        for (p=data;p<data+n*2;p+=32,II+=16,QQ+=16) {
            LOAD_INT8C(dat1,dat2,p   ,zero,mask8);
            LOAD_INT8C(dat3,dat4,p+16,zero,mask8);

            DBLTOINT16(ind1,xmm1,xmm2,xmm3,xmm4,mask4);
            DBLTOINT16(ind2,xmm5,xmm6,xmm7,xmm8,mask4);
            ind1=_mm_packus_epi16(ind1,ind2);
            MIX_INT8(I1,dat1,dat3,xcos,ind1,zero);
            MIX_INT8(I2,dat1,dat3,xsin,ind1,zero);
            MIX_INT8(Q1,dat2,dat4,xsin,ind1,zero);
            MIX_INT8(Q2,dat2,dat4,xcos,ind1,zero);
            for (i=0;i<16;i++) {
                II[i]=I1[i]-Q1[i];
                QQ[i]=I2[i]+Q2[i];
            }
            xmm1=_mm_add_pd(xmm1,xmm9);
            xmm2=_mm_add_pd(xmm2,xmm9);
            xmm3=_mm_add_pd(xmm3,xmm9);
            xmm4=_mm_add_pd(xmm4,xmm9);
            xmm5=_mm_add_pd(xmm5,xmm9);
            xmm6=_mm_add_pd(xmm6,xmm9);
            xmm7=_mm_add_pd(xmm7,xmm9);
            xmm8=_mm_add_pd(xmm8,xmm9);
        }
    }
    if (dtype==DTYPEI) { /* real */
        for (p=data;p<data+n;p+=16,II+=16,QQ+=16) {
            LOAD_INT8(dat1,dat2,p,zero);

            DBLTOINT16(ind1,xmm1,xmm2,xmm3,xmm4,mask4);
            DBLTOINT16(ind2,xmm5,xmm6,xmm7,xmm8,mask4);
            ind1=_mm_packus_epi16(ind1,ind2);
            MIX_INT8(II,dat1,dat2,xcos,ind1,zero);
            MIX_INT8(QQ,dat1,dat2,xsin,ind1,zero);
            xmm1=_mm_add_pd(xmm1,xmm9);
            xmm2=_mm_add_pd(xmm2,xmm9);
            xmm3=_mm_add_pd(xmm3,xmm9);
            xmm4=_mm_add_pd(xmm4,xmm9);
            xmm5=_mm_add_pd(xmm5,xmm9);
            xmm6=_mm_add_pd(xmm6,xmm9);
            xmm7=_mm_add_pd(xmm7,xmm9);
            xmm8=_mm_add_pd(xmm8,xmm9);
        }
    }
    prem=phi0+freq*ti*n*DPI;
    while(prem>DPI) prem-=DPI;
    return prem;
#endif
}
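
The returned remainder is phi0 + 2*pi*freq*ti*n reduced modulo 2*pi; the while loop above subtracts one period at a time, which is fine when the phase advances by only a few periods per call. A sketch of the same reduction using fmod, assuming DPI holds 2*pi; unlike the loop, it also normalizes negative phases:

#include <math.h>

static double phase_remainder(double phi0, double freq, double ti,
                              int n, double dpi)
{
    double prem = fmod(phi0 + freq * ti * n * dpi, dpi);
    if (prem < 0.0) prem += dpi; /* keep the result in [0, dpi) */
    return prem;
}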
Example #15
// a = b^{\dagger} * c^{\dagger}
void 
multab_dagdag( GLU_complex a[ NCNC ] , 
	       const GLU_complex b[ NCNC ] , 
	       const GLU_complex c[ NCNC ] )
{
  // recast to alignment
  __m128d *A = (__m128d*)a ;
  const __m128d *B = (const __m128d*)b ;
  const __m128d *C = (const __m128d*)c ;
#if NC==3
  // a[0] = conj( b[0] ) * conj( c[0] ) + conj( b[3] ) * conj( c[1] ) + conj( b[6] ) * conj( c[2] ) ;
  *( A + 0 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 0 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 3 ) , *( C + 1 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 6 ) , *( C + 2 ) ) ) ) ;
  //a[1] = conj( b[0] ) * conj( c[3] ) + conj( b[3] ) * conj( c[4] ) + conj( b[6] ) * conj( c[5] ) ; 
  *( A + 1 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 0 ) , *( C + 3 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 3 ) , *( C + 4 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 6 ) , *( C + 5 ) ) ) ) ;
  //a[2] = conj( b[0] ) * conj( c[6] ) + conj( b[3] ) * conj( c[7] ) + conj( b[6] ) * conj( c[8] ) ; 
  *( A + 2 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 0 ) , *( C + 6 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 3 ) , *( C + 7 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 6 ) , *( C + 8 ) ) ) ) ;
  // middle row //
  //a[3] = conj( b[1] ) * conj( c[0] ) + conj( b[4] ) * conj( c[1] ) + conj( b[7] ) * conj( c[2] ) ; 
  *( A + 3 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 1 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 4 ) , *( C + 1 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 7 ) , *( C + 2 ) ) ) ) ;
  //a[4] = conj( b[1] ) * conj( c[3] ) + conj( b[4] ) * conj( c[4] ) + conj( b[7] ) * conj( c[5] ) ; 
  *( A + 4 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 1 ) , *( C + 3 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 4 ) , *( C + 4 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 7 ) , *( C + 5 ) ) ) ) ;
  //a[5] = conj( b[1] ) * conj( c[6] ) + conj( b[4] ) * conj( c[7] ) + conj( b[7] ) * conj( c[8] ) ; 
  *( A + 5 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 1 ) , *( C + 6 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 4 ) , *( C + 7 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 7 ) , *( C + 8 ) ) ) ) ;
  //a[6] = conj( b[2] ) * conj( c[0] ) + conj( b[5] ) * conj( c[1] ) + conj( b[8] ) * conj( c[2] ) ;
  *( A + 6 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 2 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 5 ) , *( C + 1 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 8 ) , *( C + 2 ) ) ) ) ;
  //a[7] = conj( b[2] ) * conj( c[3] ) + conj( b[5] ) * conj( c[4] ) + conj( b[8] ) * conj( c[5] ) ;
  *( A + 7 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 2 ) , *( C + 3 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 5 ) , *( C + 4 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 8 ) , *( C + 5 ) ) ) ) ;
  //a[8] = conj( b[2] ) * conj( c[6] ) + conj( b[5] ) * conj( c[7] ) + conj( b[8] ) * conj( c[8] ) ; 
  *( A + 8 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 2 ) , *( C + 6 ) ) ,
			   _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 5 ) , *( C + 7 ) ) ,
				       SSE2_MUL_CONJCONJ( *( B + 8 ) , *( C + 8 ) ) ) ) ;
#elif NC==2
  //a[0] = conj( b[0] ) * conj( c[0] ) + conj( b[2] ) * conj( c[1] ) ;
  *( A + 0 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 0 ) , *( C + 0 ) ) ,
			   SSE2_MUL_CONJCONJ( *( B + 2 ) , *( C + 1 ) ) ) ;
  //a[1] = conj( b[0] ) * conj( c[2] ) + conj( b[2] ) * conj( c[3] ) ;
  *( A + 1 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 0 ) , *( C + 2 ) ) ,
			   SSE2_MUL_CONJCONJ( *( B + 2 ) , *( C + 3 ) ) ) ;
  //a[2] = conj( b[1] ) * conj( c[0] ) + conj( b[3] ) * conj( c[1] ) ; 
  *( A + 2 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 1 ) , *( C + 0 ) ) ,
			   SSE2_MUL_CONJCONJ( *( B + 3 ) , *( C + 1 ) ) ) ;
  //a[3] = conj( b[1] ) * conj( c[2] ) + conj( b[3] ) * conj( c[3] ) ; 
  *( A + 3 ) = _mm_add_pd( SSE2_MUL_CONJCONJ( *( B + 1 ) , *( C + 2 ) ) ,
			   SSE2_MUL_CONJCONJ( *( B + 3 ) , *( C + 3 ) ) ) ;
#else
  size_t i , j , m ;
  register __m128d sum ;
  const __m128d *pC , *pB ;
  for( i = 0 ; i < NC ; i++ ) {
    pC = C ;
    for( j = 0 ; j < NC ; j++ ) {
      pB = B ;
      sum = _mm_setzero_pd( ) ;
      for( m = 0 ; m < NC ; m++ ) {
	sum = _mm_add_pd( sum , SSE2_MUL_CONJCONJ( pB[i] , pC[m] ) ) ;
	pB += NC ;
      }
      *A++ = sum ;
      pC += NC ;
    }
  }
#endif
  return ;
}
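
SSE2_MUL_CONJCONJ is a macro from the surrounding codebase that multiplies two complex numbers (stored as {re, im} in one __m128d each) with both factors conjugated; since conj(b)*conj(c) = conj(b*c), the arithmetic is re = br*cr - bi*ci and im = -(br*ci + bi*cr). A scalar sketch of that product:

static void mul_conjconj(const double b[2], const double c[2], double a[2])
{
    /* conj(b) * conj(c) == conj(b * c) */
    a[0] = b[0] * c[0] - b[1] * c[1];    /* real part */
    a[1] = -(b[0] * c[1] + b[1] * c[0]); /* imaginary part */
}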
Example #16
// it moves horizontally inside a block
void kernel_dgemv_n_2_lib4(int kmax, double *A, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0, x_1, x_2, x_3,
		y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1;
	
	y_0_1 = _mm_setzero_pd();	
	y_0_1_b = _mm_setzero_pd();	
	y_0_1_c = _mm_setzero_pd();	
	y_0_1_d = _mm_setzero_pd();	

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		x_2 = _mm_loaddup_pd( &x[2] );
		x_3 = _mm_loaddup_pd( &x[3] );

		a_02_12 = _mm_load_pd( &A[0+lda*2] );
		a_03_13 = _mm_load_pd( &A[0+lda*3] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp );

		ax_temp = _mm_mul_pd( a_02_12, x_2 );
		y_0_1_c = _mm_add_pd( y_0_1_c, ax_temp );
		ax_temp = _mm_mul_pd( a_03_13, x_3 );
		y_0_1_d = _mm_add_pd( y_0_1_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}

	y_0_1 = _mm_add_pd( y_0_1, y_0_1_c );
	y_0_1_b = _mm_add_pd( y_0_1_b, y_0_1_d );

	if(kmax%4>=2)
		{

		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1 = _mm_add_pd( y_0_1, y_0_1_b );

	if(kmax%2==1)
		{

		x_0 = _mm_loaddup_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );

		}

	if(alg==0)
		{
		_mm_storeu_pd(&y[0], y_0_1);
		}
	else if(alg==1)
		{
		z_0_1 = _mm_loadu_pd( &y[0] );

		z_0_1 = _mm_add_pd( z_0_1, y_0_1 );

		_mm_storeu_pd(&y[0], z_0_1);
		}
	else // alg==-1
		{
		z_0_1 = _mm_loadu_pd( &y[0] );

		z_0_1 = _mm_sub_pd( z_0_1, y_0_1 );

		_mm_storeu_pd(&y[0], z_0_1);
		}

	}
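
kernel_dgemv_n_2_lib4 keeps four independent accumulators (y_0_1 through y_0_1_d) so that consecutive _mm_add_pd results never feed each other, hiding the floating-point add latency; the partial sums are only combined after the unrolled loop. A scalar sketch of the same multiple-accumulator idea for a dot product:

static double dot_four_accumulators(const double *a, const double *b, int n)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    int i = 0;
    for (; i < n - 3; i += 4) {
        /* four independent dependency chains */
        s0 += a[i]     * b[i];
        s1 += a[i + 1] * b[i + 1];
        s2 += a[i + 2] * b[i + 2];
        s3 += a[i + 3] * b[i + 3];
    }
    for (; i < n; ++i)
        s0 += a[i] * b[i];
    return (s0 + s2) + (s1 + s3);
}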
Example #17
mlib_status
__mlib_SignalLMSFilter_F32S_F32S(
    mlib_f32 *dst,
    const mlib_f32 *src,
    const mlib_f32 *ref,
    void *filter,
    mlib_s32 n)
{
	LMS_Filter *pLMS_Filter = (LMS_Filter *) filter;
	mlib_d64 *W[2], *srcBuffer[2];
	mlib_d64 U = pLMS_Filter->U;
	mlib_d64 D, Y;
	mlib_s32 i, t, j, off, count;
	mlib_s32 tap, bufferSize;
	mlib_f32 *psrc, *pdst, *pref;

	__m128d srcs1, srcs2, srcs3, srcs4;
	__m128d SBE0, SD, temp;
	__m128d *SW;

	if (src == NULL || dst == NULL || ref == NULL)
		return (MLIB_NULLPOINTER);

	if (n <= 0)
		return (MLIB_FAILURE);

	tap = pLMS_Filter->tap;
	bufferSize = pLMS_Filter->bufferSize;

	SW = __mlib_malloc(tap * sizeof (__m128d));

	W[0] = pLMS_Filter->data;
	srcBuffer[0] = (W[0] + 2 * tap);
	W[1] = srcBuffer[0] + 2 * bufferSize;
	srcBuffer[1] = (W[1] + 2 * tap);

	psrc = (mlib_f32 *)src;
	pdst = (mlib_f32 *)dst;
	pref = (mlib_f32 *)ref;

	for (off = 0; off < n; off += count) {

		count = n - off;
		if (count > bufferSize) {
			count = bufferSize;
		}

		for (i = 0; i < count; i++) {
			srcBuffer[0][i] = (mlib_d64)psrc[2 * i];
			srcBuffer[1][i] = (mlib_d64)psrc[2 * i + 1];
		}

		for (j = 0; j < 2; j++) {
			mlib_d64 *psrcBuffer = srcBuffer[j];
			mlib_d64 *pW = W[j];
			mlib_d64 BE0 = pLMS_Filter->BE[j];

			for (i = 0; i < (tap / 2); i++) {
				SW[i] = _mm_set_pd(pW[i * 2], pW[i * 2 + 1]);
			}

			for (t = 0; t < count; t++) {
				D = 0;
				Y = pref[2 * t + j];
				SBE0 = _mm_set1_pd(BE0);
				SD = _mm_setzero_pd();
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i < (tap / 2); i++) {
					srcs1 = _mm_loadu_pd(
					    psrcBuffer + t - i * 2 - 2);
					srcs2 = _mm_loadu_pd(
					    psrcBuffer + t - i * 2 - 1);

					temp = _mm_mul_pd(SBE0, srcs1);
					SW[i] = _mm_add_pd(SW[i], temp);
					SD = _mm_add_pd(SD,
					    _mm_mul_pd(srcs2, SW[i]));
				}

				mlib_d64 TD[2];
				_mm_storeu_pd(TD, SD);
				D += TD[0] + TD[1];

				for (i = i * 2; i < tap; i++) {
					pW[i] += BE0 * psrcBuffer[t - i - 1];
					D += psrcBuffer[t - i] * pW[i];
				}

				BE0 = (Y - D) * U;

				pdst[2 * t + j] = (mlib_f32)D;
			}

			mlib_d64 TW[2];
			for (i = 0; i < (tap / 2); i++) {
				_mm_storeu_pd(TW, SW[i]);
				pW[i * 2] = TW[1];
				pW[i * 2 + 1] = TW[0];
			}

			for (i = 0; i < tap; i++) {
				psrcBuffer[i - tap] =
				    psrcBuffer[count + (i - tap)];
			}

			pLMS_Filter->BE[j] = BE0;
		}

		psrc += 2 * count;
		pdst += 2 * count;
		pref += 2 * count;
	}

	__mlib_free(SW);
	return (MLIB_SUCCESS);
}
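
The inner loop above is the LMS recursion: BE0 carries U*(Y - D) from the previous sample, it is folded into the weights (pW[i] += BE0 * x[t-i-1]), and the updated weights produce the new output D as the sum over i of pW[i] * x[t-i]. A scalar sketch of one step under those definitions, assuming x points at the current sample inside a buffer with at least tap + 1 older samples before it:

static double lms_step(double *w, const double *x, int tap,
                       double mu, double ref, double *err)
{
    /* *err holds mu * (previous reference - previous output) on entry */
    double out = 0.0;
    int i;
    for (i = 0; i < tap; ++i) {
        w[i] += *err * x[-i - 1]; /* update weight with the delayed input */
        out  += w[i] * x[-i];     /* accumulate the filter output */
    }
    *err = mu * (ref - out); /* error feeding the next step */
    return out;
}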
Example #18
// it moves horizontally inside a block
void kernel_dgemv_n_1_lib4(int kmax, double *A, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m128d
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0, x_1, x_2, x_3,
		y_0, y_0_b, y_0_c, y_0_d, z_0;
	
	y_0 = _mm_setzero_pd();	
	y_0_b = _mm_setzero_pd();	
	y_0_c = _mm_setzero_pd();	
	y_0_d = _mm_setzero_pd();	

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm_load_sd( &x[0] );
		x_1 = _mm_load_sd( &x[1] );

		a_00 = _mm_load_sd( &A[0+lda*0] );
		a_01 = _mm_load_sd( &A[0+lda*1] );

		x_2 = _mm_load_sd( &x[2] );
		x_3 = _mm_load_sd( &x[3] );

		a_02 = _mm_load_sd( &A[0+lda*2] );
		a_03 = _mm_load_sd( &A[0+lda*3] );

/*		y_0 += a_00 * x_0;*/
		ax_temp = _mm_mul_sd( a_00, x_0 );
		y_0 = _mm_add_sd( y_0, ax_temp );
/*		y_0 += a_01 * x_1;*/
		ax_temp = _mm_mul_sd( a_01, x_1 );
		y_0_b = _mm_add_sd( y_0_b, ax_temp );

/*		y_0 += a_02 * x_2;*/
		ax_temp = _mm_mul_sd( a_02, x_2 );
		y_0_c = _mm_add_sd( y_0_c, ax_temp );
/*		y_0 += a_03 * x_3;*/
		ax_temp = _mm_mul_sd( a_03, x_3 );
		y_0_d = _mm_add_sd( y_0_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}

	y_0 = _mm_add_pd( y_0, y_0_c );
	y_0_b = _mm_add_pd( y_0_b, y_0_d );

	if(kmax%4>=2)
		{

		x_0 = _mm_load_sd( &x[0] );
		x_1 = _mm_load_sd( &x[1] );

		a_00 = _mm_load_sd( &A[0+lda*0] );
		a_01 = _mm_load_sd( &A[0+lda*1] );

/*		y_0 += a_00 * x_0;*/
		ax_temp = _mm_mul_sd( a_00, x_0 );
		y_0 = _mm_add_sd( y_0, ax_temp );
/*		y_0 += a_01 * x_1;*/
		ax_temp = _mm_mul_sd( a_01, x_1 );
		y_0_b = _mm_add_sd( y_0_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0 = _mm_add_pd( y_0, y_0_b );

	if(kmax%2==1)
		{

		x_0 = _mm_load_sd( &x[0] );

		a_00 = _mm_load_sd( &A[0+lda*0] );

/*		y_0 += a_00 * x_0;*/
		ax_temp = _mm_mul_sd( a_00, x_0 );
		y_0 = _mm_add_sd( y_0, ax_temp );
		
/*		A += 1*lda;*/
/*		x += 1;*/

		}

	if(alg==0)
		{
		_mm_store_sd(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm_load_sd( &y[0] );

/*		z_0 += y_0;*/
		z_0 = _mm_add_sd( z_0, y_0 );

		_mm_store_sd(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm_load_sd( &y[0] );

/*		z_0 -= y_0;*/
		z_0 = _mm_sub_sd( z_0, y_0 );

		_mm_store_sd(&y[0], z_0);
		}

	}
Example #19
void nb_kernel430_x86_64_sse2(int *           p_nri,
                              int *           iinr,
                              int *           jindex,
                              int *           jjnr,
                              int *           shift,
                              double *         shiftvec,
                              double *         fshift,
                              int *           gid,
                              double *         pos,
                              double *         faction,
                              double *         charge,
                              double *         p_facel,
                              double *         p_krf,
                              double *         p_crf,
                              double *         vc,
                              int *           type,
                              int *           p_ntype,
                              double *         vdwparam,
                              double *         vvdw,
                              double *         p_tabscale,
                              double *         VFtab,
                              double *         invsqrta,
                              double *         dvda,
                              double *         p_gbtabscale,
                              double *         GBtab,
                              int *           p_nthreads,
                              int *           count,
                              void *          mtx,
                              int *           outeriter,
                              int *           inneriter,
                              double *         work)
{
    int           nri,ntype,nthreads;
    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
    double        shX,shY,shZ;
	int			  offset,nti;
    int           jnrA,jnrB;
    int           j3A,j3B;
	int           tjA,tjB;
	gmx_gbdata_t *gbdata;
	double *      gpol;
    
	__m128d  iq,qq,jq,isai;
	__m128d  ix,iy,iz;
	__m128d  jx,jy,jz;
	__m128d  dx,dy,dz;
	__m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
	__m128d  fix,fiy,fiz,tx,ty,tz,rsq;
	__m128d  rinv,isaj,isaprod;
	__m128d  vcoul,fscal,gbscale,c6,c12;
	__m128d  rinvsq,r,rtab;
	__m128d  eps,Y,F,G,H;
    __m128d  VV,FF,Fp;
	__m128d  vgb,fijGB,dvdatmp;
	__m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
	__m128d  facel,gbtabscale,dvdaj;
    __m128d  fijD,fijR;
    __m128d  xmm1,tabscale,eps2;
	__m128i  n0, nnn;
    
	
	const __m128d neg        = _mm_set1_pd(-1.0);
	const __m128d zero       = _mm_set1_pd(0.0);
	const __m128d minushalf  = _mm_set1_pd(-0.5);
	const __m128d two        = _mm_set1_pd(2.0);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;
    
	nri        = *p_nri;
	ntype      = *p_ntype;
    
    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
    gbtabscale = _mm_load1_pd(p_gbtabscale);  
    facel      = _mm_load1_pd(p_facel);
    tabscale   = _mm_load1_pd(p_tabscale);
    
    nj1         = 0;
    jnrA = jnrB = 0;
    j3A = j3B   = 0;
    jx          = _mm_setzero_pd();
    jy          = _mm_setzero_pd();
    jz          = _mm_setzero_pd();
    c6          = _mm_setzero_pd();
    c12         = _mm_setzero_pd();
	
	for(n=0;n<nri;n++)
	{
        is3              = 3*shift[n];     
        shX              = shiftvec[is3];  
        shY              = shiftvec[is3+1];
        shZ              = shiftvec[is3+2];
        nj0              = jindex[n];      
        nj1              = jindex[n+1];    
        ii               = iinr[n];        
        ii3              = 3*ii;           
		
		ix               = _mm_set1_pd(shX+pos[ii3+0]);
		iy               = _mm_set1_pd(shY+pos[ii3+1]);
		iz               = _mm_set1_pd(shZ+pos[ii3+2]);
        
		iq               = _mm_load1_pd(charge+ii);
		iq               = _mm_mul_pd(iq,facel);
        
		isai             = _mm_load1_pd(invsqrta+ii);
        
		nti              = 2*ntype*type[ii];
		
		vctot            = _mm_setzero_pd();
		vvdwtot          = _mm_setzero_pd();
		vgbtot           = _mm_setzero_pd();
		dvdasum          = _mm_setzero_pd();
		fix              = _mm_setzero_pd();
		fiy              = _mm_setzero_pd();
		fiz              = _mm_setzero_pd();
        
		for(k=nj0;k<nj1-1; k+=2)
		{
			jnrA    = jjnr[k];
			jnrB    = jjnr[k+1];
			
			j3A     = jnrA * 3;
			j3B     = jnrB * 3;
            
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
            
			dx           = _mm_sub_pd(ix,jx);
			dy           = _mm_sub_pd(iy,jy);
			dz           = _mm_sub_pd(iz,jz);
            
            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
            
            rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_pd(rinv,rinv);
            
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
			GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
            
            /* Lennard-Jones */
            tjA          = nti+2*type[jnrA];
			tjB          = nti+2*type[jnrB];
            
            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
			
			isaprod      = _mm_mul_pd(isai,isaj);
			qq           = _mm_mul_pd(iq,jq);            
			vcoul        = _mm_mul_pd(qq,rinv);
			fscal        = _mm_mul_pd(vcoul,rinv);                                 
            vctot        = _mm_add_pd(vctot,vcoul);
            
            /* Polarization interaction */
			qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
			gbscale      = _mm_mul_pd(isaprod,gbtabscale);
            
 			/* Calculate GB table index */
			r            = _mm_mul_pd(rsq,rinv);
			rtab         = _mm_mul_pd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
            /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_pd(G,eps);
            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
            vgb     = _mm_mul_pd(Y, qq);           
            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
            
            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
            
            vgbtot  = _mm_add_pd(vgbtot, vgb);
            
            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
            
            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
			
            /* Calculate VDW table index */
			rtab    = _mm_mul_pd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rtab);
			eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			eps2    = _mm_mul_pd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
            /* Dispersion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			vvdw6   = _mm_mul_pd(c6,VV);
			fijD    = _mm_mul_pd(c6,FF);
            
            /* Repulsion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			vvdw12  = _mm_mul_pd(c12,VV);
			fijR    = _mm_mul_pd(c12,FF);
			
			vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
			vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
            
			xmm1    = _mm_add_pd(fijD,fijR);
			xmm1    = _mm_mul_pd(xmm1,tabscale);
			xmm1    = _mm_add_pd(xmm1,fijGB);
			xmm1    = _mm_sub_pd(xmm1,fscal);
			fscal   = _mm_mul_pd(xmm1,neg);
			fscal   = _mm_mul_pd(fscal,rinv);
            
            /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
            
            /* Calculate temporary vectorial force */
            tx           = _mm_mul_pd(fscal,dx);
            ty           = _mm_mul_pd(fscal,dy);
            tz           = _mm_mul_pd(fscal,dz);
            
            /* Increment i atom force */
            fix          = _mm_add_pd(fix,tx);
            fiy          = _mm_add_pd(fiy,ty);
            fiz          = _mm_add_pd(fiz,tz);
            
            /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(k<nj1)
		{
			jnrA    = jjnr[k];
			j3A     = jnrA * 3;
            
            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
            
			dx           = _mm_sub_sd(ix,jx);
			dy           = _mm_sub_sd(iy,jy);
			dz           = _mm_sub_sd(iz,jz);
            
            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
            
            rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_sd(rinv,rinv);
            
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
			GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
            
            /* Lennard-Jones */
            tjA          = nti+2*type[jnrA];
            
            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
			
			isaprod      = _mm_mul_sd(isai,isaj);
			qq           = _mm_mul_sd(iq,jq);            
			vcoul        = _mm_mul_sd(qq,rinv);
			fscal        = _mm_mul_sd(vcoul,rinv);                                 
            vctot        = _mm_add_sd(vctot,vcoul);
            
            /* Polarization interaction */
			qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
			gbscale      = _mm_mul_sd(isaprod,gbtabscale);
            
 			/* Calculate GB table index */
			r            = _mm_mul_sd(rsq,rinv);
			rtab         = _mm_mul_sd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
            /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_sd(G,eps);
            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
            vgb     = _mm_mul_sd(Y, qq);           
            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
            
            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
            
            vgbtot  = _mm_add_sd(vgbtot, vgb);
            
            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
            
            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
			
            /* Calculate VDW table index */
			rtab    = _mm_mul_sd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rtab);
			eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			eps2    = _mm_mul_sd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
            /* Dispersion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			vvdw6   = _mm_mul_sd(c6,VV);
			fijD    = _mm_mul_sd(c6,FF);
            
            /* Repulsion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
            F            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
            H            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			vvdw12  = _mm_mul_sd(c12,VV);
			fijR    = _mm_mul_sd(c12,FF);
			
			vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
			vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
            
			xmm1    = _mm_add_sd(fijD,fijR);
			xmm1    = _mm_mul_sd(xmm1,tabscale);
			xmm1    = _mm_add_sd(xmm1,fijGB);
			xmm1    = _mm_sub_sd(xmm1,fscal);
			fscal   = _mm_mul_sd(xmm1,neg);
			fscal   = _mm_mul_sd(fscal,rinv);

            /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
            
            /* Calculate temporary vectorial force */
            tx           = _mm_mul_sd(fscal,dx);
            ty           = _mm_mul_sd(fscal,dy);
            tz           = _mm_mul_sd(fscal,dz);
            
            /* Increment i atom force */
            fix          = _mm_add_sd(fix,tx);
            fiy          = _mm_add_sd(fiy,ty);
            fiz          = _mm_add_sd(fiz,tz);
            
            /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
		}
		
        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
        
        ggid     = gid[n];         
        
        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
	}
    
	*outeriter   = nri;            
    *inneriter   = nj1; 	
}
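Both table sections in the kernel above evaluate the same cubic spline: each table slot stores four coefficients Y, F, G, H, the potential is VV = Y + F*eps + G*eps^2 + H*eps^3, and the force factor FF is its derivative F + 2*G*eps + 3*H*eps^2 (the VDW table packs a dispersion slot and a repulsion slot per index, hence the shift by 3 instead of 2 when forming nnn). A scalar sketch of one lookup (table_interp is an illustrative name; the slot layout is taken from the loads above):

void table_interp(const double *tab, double rtab, double *VV, double *FF)
{
    int    n0  = (int)rtab;             /* integer table index      */
    double eps = rtab - (double)n0;     /* fractional part in [0,1) */
    const double *t = tab + 4*n0;       /* slot = {Y, F, G, H}      */
    double Y = t[0], F = t[1], G = t[2], H = t[3];
    double Fp = F + G*eps + H*eps*eps;
    *VV = Y + Fp*eps;
    *FF = Fp + G*eps + 2.0*H*eps*eps;   /* = F + 2*G*eps + 3*H*eps*eps */
}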
Example #20
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	double *tA, *tx;

	int k;
	int ka = kmax; // number of elements from the aligned position
	
	__m256d
		aaxx_temp, 
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77;
	
	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0_1,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();
	y_44 = _mm256_setzero_pd();
	y_55 = _mm256_setzero_pd();
	y_66 = _mm256_setzero_pd();
	y_77 = _mm256_setzero_pd();
	
	y_0 = _mm256_castpd256_pd128(y_00);
	y_1 = _mm256_castpd256_pd128(y_11);
	y_2 = _mm256_castpd256_pd128(y_22);
	y_3 = _mm256_castpd256_pd128(y_33);
	y_4 = _mm256_castpd256_pd128(y_44);
	y_5 = _mm256_castpd256_pd128(y_55);
	y_6 = _mm256_castpd256_pd128(y_66);
	y_7 = _mm256_castpd256_pd128(y_77);

	/* handle the ka%lda tail first, reading from past the aligned
	   region (tA, tx), so the main loops below see full 4-panels */
	k = lda*(ka/lda);
	tA = A + (ka/lda)*sda*lda;
	tx = x + (ka/lda)*lda;

	if(ka-k>0) // the remainder ka-k can only be 1, 2 or 3
		{
		if((ka-k)>=2)
			{
		
			x_0_1 = _mm_load_pd( &tx[0] );

			a_00_10 = _mm_load_pd( &tA[0+lda*0] );
			a_01_11 = _mm_load_pd( &tA[0+lda*1] );
			a_02_12 = _mm_load_pd( &tA[0+lda*2] );
			a_03_13 = _mm_load_pd( &tA[0+lda*3] );

			ax_temp = _mm_mul_pd( a_00_10, x_0_1 );	
			y_0 = _mm_add_pd (y_0, ax_temp );
			ax_temp = _mm_mul_pd( a_01_11, x_0_1 );	
			y_1 = _mm_add_pd (y_1, ax_temp );
			ax_temp = _mm_mul_pd( a_02_12, x_0_1 );	
			y_2 = _mm_add_pd (y_2, ax_temp );
			ax_temp = _mm_mul_pd( a_03_13, x_0_1 );	
			y_3 = _mm_add_pd (y_3, ax_temp );
		
			a_00_10 = _mm_load_pd( &tA[0+lda*4] );
			a_01_11 = _mm_load_pd( &tA[0+lda*5] );
			a_02_12 = _mm_load_pd( &tA[0+lda*6] );
			a_03_13 = _mm_load_pd( &tA[0+lda*7] );

			ax_temp = _mm_mul_pd( a_00_10, x_0_1 );	
			y_4 = _mm_add_pd (y_4, ax_temp );
			ax_temp = _mm_mul_pd( a_01_11, x_0_1 );	
			y_5 = _mm_add_pd (y_5, ax_temp );
			ax_temp = _mm_mul_pd( a_02_12, x_0_1 );	
			y_6 = _mm_add_pd (y_6, ax_temp );
			ax_temp = _mm_mul_pd( a_03_13, x_0_1 );	
			y_7 = _mm_add_pd (y_7, ax_temp );
		
			tA += 2;
			tx += 2;
			k+=2;
		
			}

		if((ka-k)==1)
			{
		
			x_0_1 = _mm_load_sd( &tx[0] );

			a_00_10 = _mm_load_sd( &tA[0+lda*0] );
			a_01_11 = _mm_load_sd( &tA[0+lda*1] );
			a_02_12 = _mm_load_sd( &tA[0+lda*2] );
			a_03_13 = _mm_load_sd( &tA[0+lda*3] );

			ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
			y_0 = _mm_add_sd (y_0, ax_temp );
			ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
			y_1 = _mm_add_sd (y_1, ax_temp );
			ax_temp = _mm_mul_sd( a_02_12, x_0_1 );	
			y_2 = _mm_add_sd (y_2, ax_temp );
			ax_temp = _mm_mul_sd( a_03_13, x_0_1 );	
			y_3 = _mm_add_sd (y_3, ax_temp );
		
			a_00_10 = _mm_load_sd( &tA[0+lda*4] );
			a_01_11 = _mm_load_sd( &tA[0+lda*5] );
			a_02_12 = _mm_load_sd( &tA[0+lda*6] );
			a_03_13 = _mm_load_sd( &tA[0+lda*7] );

			ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
			y_4 = _mm_add_sd (y_4, ax_temp );
			ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
			y_5 = _mm_add_sd (y_5, ax_temp );
			ax_temp = _mm_mul_sd( a_02_12, x_0_1 );	
			y_6 = _mm_add_sd (y_6, ax_temp );
			ax_temp = _mm_mul_sd( a_03_13, x_0_1 );	
			y_7 = _mm_add_sd (y_7, ax_temp );
		
			tA += 1;
			tx += 1;
			k++;
		
			}

		}

	y_00 = _mm256_castpd128_pd256(y_0);
	y_11 = _mm256_castpd128_pd256(y_1);
	y_22 = _mm256_castpd128_pd256(y_2);
	y_33 = _mm256_castpd128_pd256(y_3);
	y_44 = _mm256_castpd128_pd256(y_4);
	y_55 = _mm256_castpd128_pd256(y_5);
	y_66 = _mm256_castpd128_pd256(y_6);
	y_77 = _mm256_castpd128_pd256(y_7);
		
	k=0;
	for(; k<ka-7; k+=8)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;


		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
	for(; k<ka-3; k+=4)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
		
	__m256d
		y_0_1_2_3, y_4_5_6_7;

	/* reduce the eight 4-wide accumulators to eight scalars:
	   hadd sums adjacent lanes pairwise, the permutes regroup the
	   128-bit halves, and one final add leaves the four sums of
	   y_00..y_33 in y_00 and those of y_44..y_77 in y_44 */
	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);
	y_44 = _mm256_hadd_pd(y_44, y_55);
	y_66 = _mm256_hadd_pd(y_66, y_77);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);
	y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 );
	y_44 = _mm256_permute2f128_pd(y_66, y_44, 19);

	y_00 = _mm256_add_pd( y_00, y_11 );
	y_44 = _mm256_add_pd( y_44, y_55 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		_mm256_storeu_pd(&y[4], y_44);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}

	}
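A scalar reference for the transposed kernel: with A stored in 4-row panels (row stride sda*4 between panels), it computes eight column dot products y[j] = sum over k of A(k,j)*x[k], with the same alg convention as above. A sketch under those assumed layout conventions (ref_dgemv_t_8 is an illustrative name):

void ref_dgemv_t_8(int kmax, double *A, int sda, double *x, double *y, int alg)
	{
	double acc[8] = {0.0};
	for(int k=0; k<kmax; k++)
		{
		// element (k,j): panel k/4, row k%4, column j; panel leading dim 4
		double *Ak = A + (k/4)*sda*4 + k%4;
		for(int j=0; j<8; j++)
			acc[j] += Ak[4*j] * x[k];
		}
	for(int j=0; j<8; j++)
		y[j] = (alg==0) ? acc[j] : (alg==1) ? y[j]+acc[j] : y[j]-acc[j];
	}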
Example #21
bool CResizeEngine::verticalFilter(CDIBSection *src, CDIBSection *dst, ILongTimeRunCallback *pCallback) {
	assert(src->getBitCounts() == dst->getBitCounts());
	int bitcount = src->getBitCounts();
	uint src_width = src->getWidth();
	uint src_height = src->getHeight();
	uint dst_width = dst->getWidth();
	uint dst_height = dst->getHeight();
	assert(src_width == dst_width);
	(void)src_width; // used only by the assert above; keeps release builds warning-free
	if (src_height == dst_height) {

		unsigned char *src_bits = (unsigned char *)src->getData();
		unsigned char *dst_bits = (unsigned char *)dst->getData();
		assert(src_bits && dst_bits);

		memcpy(dst_bits, src_bits, dst_height * dst->getStride());

	} else if (!m_pFilter) { // fast (COLOR ON COLOR)

		double ratio_h = (double)src_height / (double)dst_height;

		uint bytespp = bitcount / 8;
		for (uint y = 0; y < dst_height; ++ y) {
			uint sy = (uint)(y * ratio_h + 0.5);
			if (sy >= src_height) {
				sy = src_height - 1;
			}
			uint8 *dst_data = (uint8 *)dst->getLine(y);
			uint8 *src_line = (uint8 *)src->getLine(sy);

			for (uint x = 0; x < dst_width; ++ x) {
				uint8 *src_data = src_line + x * bytespp;
				for (uint i = 0; i < bytespp; ++ i) {
					*dst_data ++ = *src_data ++;
				}
			}
		}

	} else {
#ifdef USE_SSE
		__m128i value, t;
		__m128 a, b, c, v05 = _mm_set_ps1(0.5);
#elif (defined(USE_SSE2))
		__m128i value, t;
		__m128d a, b, c, v05 = _mm_set1_pd(0.5);
#endif
		uint index; // pixel index
		CWeightsTable weightsTable(m_pFilter, dst_height, src_height);

		uint bytespp = src->getBitCounts() / 8;
		assert(bytespp == 3 || bytespp == 4);

		unsigned src_pitch = src->getStride();
		unsigned dst_pitch = dst->getStride();

		for(uint x = 0; x < dst_width; ++ x) {
			// test for stop
			if (x % 16 == 0) {
				if (pCallback && pCallback->shouldStop()) {
					return false;
				}
			}
			index = x * bytespp;

			unsigned char *dst_bits = (unsigned char *)dst->getData();
			dst_bits += index;

			for(uint y = 0; y < dst_height; ++ y) {
#ifdef USE_SSE
				__m128 v = _mm_set_ps1(0.0);
#elif defined (USE_SSE2)
				__m128d v1 = _mm_set1_pd(0.0);
				__m128d v2 = _mm_set1_pd(0.0);
#elif defined (USE_FLOAT)
				float value[4] = {0, 0, 0, 0};
#else
				double value[4] = {0, 0, 0, 0}; // 4 = 32bpp max
#endif
				int iLeft = weightsTable.getLeftBoundary(y);
				int iRight = weightsTable.getRightBoundary(y);

				uint8 *src_bits = src->getLine(iLeft);
				src_bits += index;

				for(int i = iLeft; i <= iRight; ++ i) {
#ifdef USE_SSE
					float weight = (float)weightsTable.getWeight(y, i - iLeft);
					a = _mm_set_ps1(weight);
					if (bytespp == 3) {
						t = _mm_set_epi32(0, src_bits[2], src_bits[1], src_bits[0]);
					} else {
						t = _mm_set_epi32(src_bits[3], src_bits[2], src_bits[1], src_bits[0]);
					}

					b = _mm_cvtepi32_ps(t);
					c = _mm_mul_ps(a, b);
					v = _mm_add_ps(v, c);
#elif defined(USE_SSE2)
					double weight = weightsTable.getWeight(y, i - iLeft);

					a = _mm_set1_pd(weight);
					t = _mm_set_epi32(0, 0, src_bits[1], src_bits[0]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v1 = _mm_add_pd(v1, c);

					t = _mm_set_epi32(0, 0, bytespp == 3 ? 0 : src_bits[3], src_bits[2]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v2 = _mm_add_pd(v2, c);
#elif defined (USE_FLOAT)
					float weight = (float)weightsTable.getWeight(y, i - iLeft);							
					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (float)src_bits[j]);
					}
#else
					double weight = weightsTable.getWeight(y, i - iLeft);							
					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (double)src_bits[j]);
					}
#endif

					src_bits += src_pitch;
				}

				// clamp and place result in destination pixel
#ifdef USE_SSE
				v = _mm_add_ps(v, v05);
				value = _mm_cvtps_epi32(v);
// 				__m128i flag = _mm_cmpgt_epi32(value, _mm_set1_epi32(0));
// 				value = _mm_and_si128(value, flag);
// 				dst_bits[0] = (unsigned char)MIN(255, value.m128i_i32[0]);
// 				dst_bits[1] = (unsigned char)MIN(255, value.m128i_i32[1]);
// 				dst_bits[2] = (unsigned char)MIN(255, value.m128i_i32[2]);
// 				if (bytespp == 4) {
// 					dst_bits[3] = (unsigned char)MIN(255, value.m128i_i32[3]);
// 				}
				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[2]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[3]), (int)255);
				}
#elif defined (USE_SSE2)
				v1 = _mm_add_pd(v1, v05);
				v2 = _mm_add_pd(v2, v05);
				value = _mm_cvtpd_epi32(v1);
 				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				value = _mm_cvtpd_epi32(v2);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				}
#else
				for (unsigned j = 0; j < bytespp; ++ j) {
					dst_bits[j] = (unsigned char)MIN(MAX((int)0, (int)(value[j] + 0.5)), (int)255);
				}
#endif

				dst_bits += dst_pitch;
			}
		}
	}
	return true;
}
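Independent of which SIMD path is compiled in, the filtered branch above computes, for every channel c of destination pixel (x, y):

	dst(x,y)[c] = clamp(0.5 + sum_{i=iLeft..iRight} w(y, i-iLeft) * src(x,i)[c], 0, 255)

where w is the precomputed kernel from CWeightsTable and the 0.5 is the rounding bias added as v05 (or explicitly in the scalar paths) before conversion to an integer.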
Example #22
void kernel_dgemv_t_2_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	double *tA, *tx;

	int k;
	int ka = kmax; // number of elements from the aligned position
	
	__m256d
		aaxx_temp,
		a_00_10_20_30, a_01_11_21_31,
		x_0_1_2_3,
		y_00, y_11;
	
	__m128d
		ax_temp,
		a_00_10, a_01_11,
		x_0_1,
		y_0, y_1, y_0_1;
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	
	y_0 = _mm256_castpd256_pd128(y_00);
	y_1 = _mm256_castpd256_pd128(y_11);

	/* as in the 8-wide kernel: handle the ka%lda tail first */
	k = lda*(ka/lda);
	tA = A + (ka/lda)*sda*lda;
	tx = x + (ka/lda)*lda;

	for(; k<ka; k++)
		{
		x_0_1 = _mm_load_sd( &tx[0] );

		a_00_10 = _mm_load_sd( &tA[0+lda*0] );
		a_01_11 = _mm_load_sd( &tA[0+lda*1] );
		
/*		y_0 += a_00_10 * x_0_1;*/
		ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
		y_0 = _mm_add_sd (y_0, ax_temp );
/*		y_1 += a_01_11 * x_0_1;*/
		ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
		y_1 = _mm_add_sd (y_1, ax_temp );
		
		tA += 1;
		tx += 1;

		}

	y_00 = _mm256_castpd128_pd256(y_0);
	y_11 = _mm256_castpd128_pd256(y_1);

	k=0;
	for(; k<ka-3; k+=4)
		{
		
		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		
/*		y_00 += a_00_10_20_30 * x_0_1_2_3;*/
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
/*		y_11 += a_01_11_21_31 * x_0_1_2_3;*/
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}

	y_00 = _mm256_hadd_pd(y_00, y_11);

	y_1 = _mm256_extractf128_pd(y_00, 1);
	y_0 = _mm256_castpd256_pd128(y_00);

/*	y_0 += y_1;*/
	y_0 = _mm_add_pd( y_0, y_1 );

	if(alg==0)
		{
		_mm_storeu_pd(&y[0], y_0);
		}
	else if(alg==1)
		{
		y_0_1 = _mm_loadu_pd( &y[0] );

/*		y_0_1 += y_0;*/
		y_0_1 = _mm_add_pd( y_0_1, y_0 );

		_mm_storeu_pd(&y[0], y_0_1);
		}
	else // alg==-1
		{
		y_0_1 = _mm_loadu_pd( &y[0] );
	
/*		y_0_1 -= y_0;*/
		y_0_1 = _mm_sub_pd( y_0_1, y_0 );
	
		_mm_storeu_pd(&y[0], y_0_1);
		}

	}
Example #23
/*
 * Subtract off x0 & x1 contribution to all remaining equations using a
 * rank-2 update with mu=2, nu=3, ku=2.  This version is for 16 SSE regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable vectorization & software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
ATL_SINLINE void ATL_rk2(ATL_CINT M, const TYPE *pA0, const TYPE *pA1,
                         const TYPE *pB0, const TYPE *pB1,
                         TYPE *C, ATL_CINT ldc0)
{
   ATL_CINT ldc=ldc0+ldc0;   /* complex elements occupy two doubles, so double the stride */
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1);
   ATL_INT i;
   ATL_CINT MM = (M&1) ? M-1 : M-2;
   register __m128d B00, B10, B01, B11, B02, B12;
   register __m128d C00, C01, C02, C10, C11, C12;
   register __m128d A, a;

   B00 = _mm_load_pd(pB0);
   B10 = _mm_load_pd(pB1);
   B01 = _mm_load_pd(pB0+2);
   B11 = _mm_load_pd(pB1+2);
   B02 = _mm_load_pd(pB0+4);
   B12 = _mm_load_pd(pB1+4);    	/* iB12, rB12 */


   C00 = _mm_load_pd(pC0);
   C01 = _mm_load_pd(pC1);
   C02 = _mm_load_pd(pC2);
   A = _mm_load_pd(pA0);                		/* iA00, rA00 */
   for (i=0; i < MM; i += 2, pA0 += 4, pA1 += 4, pC0 += 4, pC1 += 4, pC2 += 4)
   {
      register __m128d b;
/*
 *    K=0, M=[0,1], apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
         C10 = _mm_load_pd(pC0+2);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA1);                		/* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
         C11 = _mm_load_pd(pC1+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         C12 = _mm_load_pd(pC2+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA0+2);                /* iA10, rA10 */
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
         _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
         _mm_store_pd(pC2, C02);
/*
 *    K=0, M=1, apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA10*rB00, rA10*rB00 */
      C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA10, iA10 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
         C00 = _mm_load_pd(pC0+4);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA1+2);               		/* iA11, rA11 */
/*
 *    K=0, M=1, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA10*iB00, iA10*iB00 */
      C10 = _mm_addsub_pd(C10, b);
         C01 = _mm_load_pd(pC1+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
         C02 = _mm_load_pd(pC2+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
/*
 *    K=1, M=1, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA11*rB10, rA11*rB10 */
      C10 = _mm_add_pd(C10, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA11, iA11 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA0+4);               		/* iA20, rA20 */
/*
 *    K=1, M=1, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA11*iB10, iA11*iB10 */
      C10 = _mm_addsub_pd(C10, b);
         _mm_store_pd(pC0+2, C10);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
         _mm_store_pd(pC1+2, C11);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
         _mm_store_pd(pC2+2, C12);
   }
/*
 * Drain pipes
 */
   {
      register __m128d b;
/*
 *    K=0, M=[0,1], apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA1);                		/* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
         _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
         _mm_store_pd(pC2, C02);
      if (!(M&1))
      {
         C10 = _mm_load_pd(pC0+2);
         C11 = _mm_load_pd(pC1+2);
         C12 = _mm_load_pd(pC2+2);
         A = _mm_load_pd(pA0+2);                /* iA10, rA10 */
/*
 *       K=0, M=1, apply real components of B0x
 */
         b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
         b = _mm_mul_pd(b, A);                     /* iA10*rB00, rA10*rB00 */
         C10 = _mm_add_pd(C10, b);
            a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA10, iA10 */
         b = _mm_movedup_pd(B01);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B02);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
            A = _mm_load_pd(pA1+2);               		/* iA11, rA11 */
/*
 *       K=0, M=1, apply imaginary components of B0x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
         b = _mm_mul_pd(b, a);                     /* rA10*iB00, iA10*iB00 */
         C10 = _mm_addsub_pd(C10, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
/*
 *       K=1, M=1, apply real components of B1x
 */
         b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
         b = _mm_mul_pd(b, A);                     /* iA11*rB10, rA11*rB10 */
         C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA11, iA11 */
         b = _mm_movedup_pd(B11);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B12);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
/*
 *       K=1, M=1, apply imaginary components of B1x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
         b = _mm_mul_pd(b, a);                     /* rA11*iB10, iA11*iB10 */
         C10 = _mm_addsub_pd(C10, b);
            _mm_store_pd(pC0+2, C10);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
            _mm_store_pd(pC1+2, C11);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
            _mm_store_pd(pC2+2, C12);
      }
   }
}
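The movedup / shuffle / addsub sequence repeated throughout the kernel is the standard SSE3 split-lane complex multiply-accumulate. Isolated, with the same [imag, real] lane layout as the comments above, it reads as follows (cplx_madd is an illustrative name; the kernel obtains the same broadcasts with integer shuffles, 0x4E to swap the halves and 0xEE to duplicate the high half):

static __m128d cplx_madd(__m128d c, __m128d a, __m128d b)
{  /* c += a*b for one complex double, lanes = [imag, real] */
   __m128d t = _mm_mul_pd(_mm_movedup_pd(b), a);     /* [ia*rb, ra*rb] */
   __m128d s = _mm_shuffle_pd(a, a, 1);              /* [ra, ia]       */
   __m128d u = _mm_mul_pd(_mm_unpackhi_pd(b, b), s); /* [ra*ib, ia*ib] */
   c = _mm_add_pd(c, t);
   return _mm_addsub_pd(c, u);  /* imag: +ra*ib, real: -ia*ib */
}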
Example #24
// element-wise add: p[i] += s[i] for the n doubles at p and s
COREARRAY_DLL_DEFAULT void vec_f64_add(double *p, const double *s, size_t n)
{
#if defined(COREARRAY_SIMD_AVX)

	/* peel leading elements one at a time (the cases fall through)
	   until p is 32-byte aligned, then run the aligned vector loop */
	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++); n--; }	/* fall through */
	case 0x10:
		if (n > 0) { (*p++) += (*s++); n--; }	/* fall through */
	case 0x18:
		if (n > 0) { (*p++) += (*s++); n--; }	/* fall through */
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_add_pd(_mm256_load_pd(p), _mm256_loadu_pd(s)));
			p += 4; s += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_add_pd(_mm256_loadu_pd(p), _mm256_loadu_pd(s)));
			p += 4; s += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	/* same peeling for the 16-byte SSE2 alignment boundary */
	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++); n--; }	/* fall through */
	case 0x00:
		for (; n >= 2; n-=2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2;
		}
	}

#endif

	for (; n > 0; n--) (*p++) += (*s++);
}